python pipeline does not execute imputer - python

I am analysing the gapminder dataset [1] using a pipeline in Python, but for some reason the imputer does not replace the NaN values. According to the documentation ("For missing values encoded as np.nan, use the string value “NaN”.") I should do it as shown below, but the code crashes with "ValueError: Input contains NaN" in the line "gm_cv.fit(X_train, y_train)". But gm_cv was created from the pipeline, and the pipeline contains the imputation step, which should remove the NaNs. Why does this not work?
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Load the gapminder CSV into a dataframe.
fn = 'gapminder.csv'
df = pd.read_csv(fn, delimiter=',')
# replace empty strings with numpy nans
df.replace('', np.nan, inplace=True)
df.replace(' ', np.nan, inplace=True)
targetVariable = 'lifeexpectancy'
# Features: every column except the target and 'country', as a NumPy array.
X = df.drop([targetVariable, 'country'], axis=1).values
# Target: the 'lifeexpectancy' column, taken as-is.
# NOTE(review): nothing above removes NaNs from y, and the transformer steps
# of a scikit-learn Pipeline are applied to X only -- so the imputer below
# cannot clean the target. The "Update" in this post reports NaNs in y.
y = df[targetVariable]
# Pipeline: mean-impute missing feature values per column (axis=0),
# standardise the features, then fit an ElasticNet regressor.
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
('scaler', StandardScaler()),
('elasticnet', ElasticNet())]
pipeline = Pipeline(steps)
# Specify the hyperparameter space: 30 evenly spaced l1_ratio values in [0, 1]
parameters = {'elasticnet__l1_ratio': np.linspace(0,1,30)}
# Create train and test sets (60% train / 40% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=98)
# Create the GridSearchCrossValidation object (3-fold cross-validation)
gm_cv = GridSearchCV(pipeline, parameters, cv=3)
# Fit to the training set
# (this is the call that raises "ValueError: Input contains NaN" in the traceback)
gm_cv.fit(X_train, y_train)
# results: score on the held-out test split
r2 = gm_cv.score(X_test, y_test)
# NOTE: the only tuned parameter is l1_ratio, although the label says "Alpha".
print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
stack trace of full error:
python.exe pipline_and_classification_II.py
Traceback (most recent call last):
File "pipline_and_classification_II.py", line 55, in <module>
gm_cv.fit(X_train, y_train)
File "lib\site-packages\sklearn\model_selection\_search.py", line 639, in fit
cv.split(X, y, groups)))
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "lib\site-packages\sklearn\model_selection\_validation.py", line 458, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "lib\site-packages\sklearn\pipeline.py", line 250, in fit
self._final_estimator.fit(Xt, y, **fit_params)
File "lib\site-packages\sklearn\linear_model\coordinate_descent.py", line 709, in fit
ensure_2d=False)
File "lib\site-packages\sklearn\utils\validation.py", line 453, in check_array
_assert_all_finite(array)
File "lib\site-packages\sklearn\utils\validation.py", line 44, in _assert_all_finite
" or a value too large for %r." % X.dtype)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Process finished with exit code 1
Update:
Debugging it slowly (without the pipeline) shows that the Imputer does not like 1d arrays (like y in my code above). When doing the nan-removing manually before with the code below it works.
# Manual workaround: impute the target outside the pipeline.
y = np.array(y)
# Positions of the NaN entries in y.
idx = np.argwhere(np.isnan(y))
# Overwrite them with the mean of the non-NaN entries (nanmean ignores NaNs).
y[idx] = np.nanmean(y)
But this defeats the purpose of the pipeline. Any ideas how to get this running without manual tinkering?
[1] http://makemeanalyst.com/download-and-learn-about-gapminder-dataset/

Related

How can I use a dataframe of multi-value in each cell as an input to machine learning for classification

I built a data frame with multiple values in each cell, as in the picture below,
and I want to use logistic regression for classification.
my code is :
# Build the feature dataframe and replace missing cells with empty strings.
# NOTE(review): per the question, each cell holds several values; the reported
# "setting an array element with a sequence" error is raised when fit() tries
# to convert such cells to a numeric array -- confirm the cell contents.
fds1 = pd.DataFrame(featuresdata)
fds1.fillna('', inplace=True)
from sklearn.model_selection import train_test_split, cross_val_score
# Hold out 30% of the rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(fds1, y, test_size=0.30, random_state=100)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
# This is the call that raises the ValueError shown in the traceback.
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy for logistic regression:", score)
but there was an error with this code:
File "C:\Users\hp\PycharmProjects\pythonProject\FE2.py", line 317, in CLS2butclick
classifier.fit(X_train, y_train)
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1138, in fit
X, y = self._validate_data(
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\base.py", line 596, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\utils\validation.py", line 1074, in check_X_y
X = check_array(
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
array = np.asarray(array, order=order, dtype=dtype)
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\generic.py", line 2064, in __array__
return np.asarray(self._values, dtype=dtype)
ValueError: setting an array element with a sequence.
How to fix that?
You need to apply label encoding before training, i.e. convert the string values into numbers that the model can work with.
Refer to https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

I am trying to predict using SVM but I receive the error
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
when executing line text_clf.fit(X_train,y_train) of my code. How to fix this and get the probability that my prediction is correct using SVM?
I am predicting the first column (gold) of my input file based on the values of the remaining columns. My input file dataExtended.txt has the following form:
gold,T-x-T,T-x-N,T-x-U,T-x-NT,T-x-UT,T-x-UN,T-x-UNT,N-x-T,N-x-N,N-x-U,N-x-NT,N-x-UT,N-x-UN,N-x-UNT,U-x-T,U-x-N,U-x-U,U-x-NT,U-x-UT,U-x-UN,U-x-UNT,NT-x-T,NT-x-N,NT-x-U,NT-x-NT,NT-x-UT,NT-x-UN,NT-x-UNT,UT-x-T,UT-x-N,UT-x-U,UT-x-NT,UT-x-UT,UT-x-UN,UT-x-UNT,UN-x-T,UN-x-N,UN-x-U,UN-x-NT,UN-x-UT,UN-x-UN,UN-x-UNT,UNT-x-T,UNT-x-N,UNT-x-U,UNT-x-NT,UNT-x-UT,UNT-x-UN,UNT-x-UNT,T-T-x,T-N-x,T-U-x,T-NT-x,T-UT-x,T-UN-x,T-UNT-x,N-T-x,N-N-x,N-U-x,N-NT-x,N-UT-x,N-UN-x,N-UNT-x,U-T-x,U-N-x,U-U-x,U-NT-x,U-UT-x,U-UN-x,U-UNT-x,NT-T-x,NT-N-x,NT-U-x,NT-NT-x,NT-UT-x,NT-UN-x,NT-UNT-x,UT-T-x,UT-N-x,UT-U-x,UT-NT-x,UT-UT-x,UT-UN-x,UT-UNT-x,UN-T-x,UN-N-x,UN-U-x,UN-NT-x,UN-UT-x,UN-UN-x,UN-UNT-x,UNT-T-x,UNT-N-x,UNT-U-x,UNT-NT-x,UNT-UT-x,UNT-UN-x,UNT-UNT-x,x-T-T,x-T-N,x-T-U,x-T-NT,x-T-UT,x-T-UN,x-T-UNT,x-N-T,x-N-N,x-N-U,x-N-NT,x-N-UT,x-N-UN,x-N-UNT,x-U-T,x-U-N,x-U-U,x-U-NT,x-U-UT,x-U-UN,x-U-UNT,x-NT-T,x-NT-N,x-NT-U,x-NT-NT,x-NT-UT,x-NT-UN,x-NT-UNT,x-UT-T,x-UT-N,x-UT-U,x-UT-NT,x-UT-UT,x-UT-UN,x-UT-UNT,x-UN-T,x-UN-N,x-UN-U,x-UN-NT,x-UN-UT,x-UN-UN,x-UN-UNT,x-UNT-T,x-UNT-N,x-UNT-U,x-UNT-NT,x-UNT-UT,x-UNT-UN,x-UNT-UNT,callersAtLeast1T,CalleesAtLeast1T,callersAllT,calleesAllT,CallersAtLeast1N,CalleesAtLeast1N,CallersAllN,CalleesAllN,childrenAtLeast1T,parentsAtLeast1T,childrenAtLeast1N,parentsAtLeast1N,childrenAllT,parentsAllT,childrenAllN,ParentsAllN,ParametersatLeast1T,FieldMethodsAtLeast1T,ReturnTypeAtLeast1T,ParametersAtLeast1N,FieldMethodsAtLeast1N,ReturnTypeN,ParametersAllT,FieldMethodsAllT,ParametersAllN,FieldMethodsAllN,ClassGoldN,ClassGoldT,Inner,Leaf,Root,Isolated,EmptyCallers,EmptyCallees,EmptyCallersCallers,EmptyCalleesCallees,Program,Requirement,MethodID
T,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,1,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,2,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,3,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,4,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,5,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,6,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,7,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,8,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,1,3
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,2,3
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,3,3
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,4,3
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,5,3
Here is my full reproducible code:
# Make predictions with a linear SVM on the dataExtended.txt dataset
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# Load the comma-separated input file into a dataframe.
data = pd.read_csv( 'dataExtended.txt', sep= ',')
row_count, column_count = data.shape
# Print the dataset shape
print ("Dataset Length: ", len(data))
print ("Dataset Shape: ", data.shape)
print("Number of columns ", column_count)
# Print the first dataset observations
print ("Dataset: ",data.head())
# Replace the two string-valued columns with integer category codes.
data['gold'] = data['gold'].astype('category').cat.codes
data['Program'] = data['Program'].astype('category').cat.codes
# Building phase: separate the target variable
# X = every column after the first, Y = the first column ('gold').
X = data.values[:, 1:column_count]
Y = data.values[:, 0]
# Splitting the dataset into train and test (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size = 0.3, random_state = 100)
# Create an SVM classifier and fit it directly on the numeric features
svclassifier = svm.LinearSVC()
print('Before fitting')
svclassifier.fit(X_train, y_train)
predicted = svclassifier.predict(X_test)
# Second model: TF-IDF vectoriser followed by LinearSVC.
# NOTE: X_train holds numeric rows, not text documents; the traceback shows
# the vectoriser calling x.lower() on each row, which is what raises
# "AttributeError: 'numpy.ndarray' object has no attribute 'lower'".
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
text_clf.fit(X_train,y_train)
Traceback leading to error:
Traceback (most recent call last):
File "<ipython-input-9-8e85a0a9f81c>", line 1, in <module>
runfile('C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python/Paper4SVM.py', wdir='C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python')
File "C:\Users\mouna\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 668, in runfile
execfile(filename, namespace)
File "C:\Users\mouna\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python/Paper4SVM.py", line 53, in <module>
text_clf.fit(X_train,y_train)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1381, in fit_transform
X = super(TfidfVectorizer, self).fit_transform(raw_documents)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 869, in fit_transform
self.fixed_vocabulary_)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 792, in _count_vocab
for feature in analyze(doc):
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 266, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 232, in <lambda>
return lambda x: strip_accents(x.lower())
You cannot use TF-IDF-related methods on numeric data; they are designed exclusively for text data, so they call string methods such as .lower() on every document, and a NumPy array row has no such method — hence the error. This is already apparent from the documentation:
fit(self, raw_documents, y=None)
Learn vocabulary and idf from training set.
Parameters
raw_documents: iterable
An iterable which yields either str, unicode or file objects.
I am afraid that your rationale, as explained in the comments:
I'm just trying to get the probability that each prediction is correct and TF-IDF seems to be the only way to do so when using SVM
is extremely weak. For starters, there is no such thing as "the probability that each prediction is correct" - I take it that you mean probabilistic predictions, in contrast to hard class predictions (see Predict classes or class probabilities?)
To get to the point of your actual requirement: in contrast to LinearSVC, which you are using here, SVC does indeed provide a predict_proba method, which should do the job (see the docs and the instructions therein). Notice that LinearSVC is not actually an SVM - see answer in Under what parameters are SVC and LinearSVC in scikit-learn equivalent? for details.
In short, forget about TF-IDF and switch to SVC instead of LinearSVC.

Incompatible dimension for X and Y matrices

I was wondering what I have wrong here. I get the error:
Traceback (most recent call last):
File "main.py", line 37, in <module>
y_pred = knn.predict(X_test)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/neighbors/classification.py", line149, in predict
neigh_dist, neigh_ind = self.kneighbors(X)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/neighbors/base.py", line 434, in kneighbors
**kwds))
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1448, in pairwise_distances_chunked
n_jobs=n_jobs, **kwds)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1588, in pairwise_distances
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1206, in _parallel_pairwise
return func(X, Y, **kwds)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 232, ineuclidean_distances
X, Y = check_pairwise_arrays(X, Y)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 125, incheck_pairwise_arrays
X.shape[1], Y.shape[1]))
ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 38 while Y.shape[1] == 43
I'm new to AI and can't find anything on the internet that really solves this problem; any comment is appreciated. This is my code:
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the dataset; the last column is the class label, the rest are features.
fileName = "breast-cancer-fixed.csv";
df = pd.read_csv(fileName)
X = df[df.columns[:-1]]
y = df[df.columns[-1]]
# Hold out 40% of the rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
# NOTE: two separate encoders are created and fitted independently, one on the
# training rows and one on the test rows. Each learns its own set of
# categories, so the encoded matrices can end up with different numbers of
# columns (38 vs 43 in the reported error).
X_train = OneHotEncoder().fit_transform(X_train)
X_test = OneHotEncoder().fit_transform(X_test)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# This is the call that raises "Incompatible dimension for X and Y matrices".
y_pred = knn.predict(X_test)
print("kNN model accuracy:", metrics.accuracy_score(y_test, y_pred))
My csv is massive and i cant upload it here so i put a small snippet in
age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events
50-59,premeno,25-29,3-5,no,2,right,left_up,yes,no-recurrence-events
50-59,ge40,40-44,0-2,no,3,left,left_up,no,no-recurrence-events
40-49,premeno,10-14,0-2,no,2,left,left_up,no,no-recurrence-events
40-49,premeno,0-4,0-2,no,2,right,right_low,no,no-recurrence-events
40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,no-recurrence-events
50-59,premeno,25-29,0-2,no,2,left,left_low,no,no-recurrence-events
60-69,ge40,15-19,0-2,no,2,right,left_up,no,no-recurrence-events
Also if i get rid of the last two line of code ( the prediction code ) it runs fine with no errors
Try adding this line anywhere above the transforms:
# A single encoder shared by train and test; handle_unknown='ignore' lets
# transform() accept categories that were not seen during fit.
enc = OneHotEncoder(handle_unknown='ignore')
then change the transform lines to the following
# Learn the categories from the training data only ...
enc = enc.fit(X_train)
# ... then apply the same fitted encoder to both sets, so that both encoded
# matrices have the same columns.
X_train = enc.transform(X_train)
X_test = enc.transform(X_test)
I get this error
```Traceback (most recent call last):
File "main.py", line 25, in <module>
X_test = OneHotEncoder().transform(X_test)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py", line 726, in transform
check_is_fitted(self, 'categories_')
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 914, in check_is_fitted
raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.exceptions.NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.```

GradientBoostingClassifier implementation

I want to implement Gradient Boosting Classifier to my Titanic ML solution based on sklearn library.
I use VS Code on Ubuntu 18.04.
I've tried:
# Splitting the Training Data
from sklearn.model_selection import train_test_split
# Features: every column of `train` except the label and the passenger id.
# NOTE(review): no encoding is applied to the remaining columns; the reported
# error ("could not convert string to float: 'Baby'") indicates that at least
# one of them still holds strings.
predictors = train.drop(['Survived', 'PassengerId'], axis=1)
target = train["Survived"]
# Hold out 22% of the rows for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(predictors,
target, test_size = 0.22, random_state = 0)
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier()
# This is the call that raises the ValueError shown in the traceback.
gbk.fit(x_train, y_train)
..which returns:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/sj/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/gradient_boosting.py", line 1395, in fit
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
File "/home/sj/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 756, in check_X_y
estimator=estimator)
File "/home/sj/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 527, in check_array
array = np.asarray(array, dtype=dtype, order=order)
File "/home/sj/anaconda3/lib/python3.7/site-packages/numpy/core/numeric.py", line 501, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: 'Baby'
Help would be appreciated. I'm quite new to DS.
I think you have non-numerical values in your training data. Your classifier can only take numerical inputs, which is why it tries to convert a string, here 'Baby', to a float. As this conversion is not possible, it fails.
Maybe look again at your data.

Editing entire row in .csv using Pandas and feeding to KNeighborsClassifier

I am new to applied machine learning, and there is this dataset which includes a column with the percentage of cocoa present in a chocolate. But when I feed that column to the fit() function of the KNeighborsClassifier, it throws the following error:
Traceback (most recent call last):
File "/home/himanshu/ML Tut-2/ML_tut2.py", line 13, in <module>
knn.fit(X_train, y_train)
File "/usr/local/lib/python2.7/dist-packages/sklearn/neighbors/base.py", line 765, in fit
X, y = check_X_y(X, y, "csr", multi_output=True)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 542, in check_X_y
ensure_min_features, warn_on_dtype, estimator)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 402, in check_array
array = np.array(array, dtype=dtype, order=order, copy=copy)
ValueError: invalid literal for float(): 72%
My code is this;
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
choco = pd.read_csv('flavors_of_cacao.csv')
# Single feature column; per the reported error its values look like '72%'.
X = choco['Cocoa']
y = choco['Name']
# NOTE(review): the other snippets on this page unpack train_test_split as
# X_train, X_test, y_train, y_test. The names here are bound in a different
# order, so `y_train` receives the second returned object -- confirm.
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state=0)
knn = KNeighborsClassifier(n_neighbors = 5)
# This is the call that raises "invalid literal for float(): 72%".
knn.fit(X_train, y_train)
It is clear that the fit() function needs a float number in the cocoa column but it is getting '%' symbol along with the number which cannot be converted to float without manipulations.
Please help me to fix this problem.
EDIT:
I have edited my CSV and removed the '%' signs from it, but now I am getting the following error;
Traceback (most recent call last):
File "/home/himanshu/ML Tut-2/ML_tut2.py", line 14, in <module>
knn.fit(X_train, y_train)
File "/usr/local/lib/python2.7/dist-packages/sklearn/neighbors/base.py", line 765, in fit
X, y = check_X_y(X, y, "csr", multi_output=True)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 552, in check_X_y
check_consistent_length(X, y)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 173, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [1346, 449]
The new code is;
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
# Cleaned copy of the dataset (per the question, the '%' signs were removed).
df = pd.read_csv('chocos.csv')
# Double brackets keep X as a one-column dataframe rather than a Series.
X = df[['Cocoa']]
y = df['Name']
# NOTE(review): the other snippets on this page unpack train_test_split as
# X_train, X_test, y_train, y_test. With the order used here, `y_train` is
# bound to the second returned object, which matches the reported
# "inconsistent numbers of samples: [1346, 449]" error -- confirm.
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state=0)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
For reference, the dataset is this.
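The second error comes from the unpacking order: train_test_split returns X_train, X_test, y_train, y_test, so writing `X_train, y_train, X_test, y_test = ...` binds the test features to y_train (hence the 1346 vs. 449 mismatch). For the first error, just use the values from that column without the percent symbol: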
# Convert each 'Cocoa' entry to a float, dropping an optional trailing '%'.
# Going through str() keeps this working when the column has already been
# cleaned and pandas parsed it as numbers (calling .replace on a float would
# raise AttributeError). Each value is wrapped in its own list because
# scikit-learn expects a 2-D feature matrix: one row per sample, one column
# per feature.
X = [[float(str(val).strip().rstrip('%'))] for val in choco['Cocoa']]

Categories

Resources