Could not convert string to float in jupyter notebook - python

I am trying to build an ML model, but I am having problems with one feature. The error says it cannot convert a string to float. I tried converting the column, but it still does not work. This is the code that builds the model:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Load the crime records from a CSV file in the working directory.
crime_data= pd.read_csv('Crime_Data_from_2020_to_Present.csv')
#offense= crime_data['Crm Cd Desc']
# Discard the listed columns; every column not named here is kept.
myData= crime_data.drop(columns=['DR_NO','Date Rptd','Rpt Dist No','Part 1-2','Crm Cd','Mocodes','Vict Age','Vict Sex','Vict Descent','Premis Desc','Weapon Used Cd','Weapon Desc','Status','Status Desc','Crm Cd 1','Crm Cd 2','Crm Cd 3','Crm Cd 4','Cross Street','Premis Cd'])
# This cast is the statement that raises: 'DATE OCC' holds date strings such
# as '01/08/2020 12:00:00 AM', which cannot be interpreted as floats.
myData['DATE OCC'] = myData['DATE OCC'].astype(float)#method for converting not working
X= myData.drop (columns=['AREA NAME']) #input data
y= myData['AREA NAME'] #output data
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
# Fit a 50-tree random forest, then predict the held-out rows.
classifier = RandomForestClassifier(n_estimators = 50)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred
This is the error I'm getting
ValueError Traceback (most recent call last)
<ipython-input-16-20d49933ca7e> in <module>
9 #offense= crime_data['Crm Cd Desc']
10 myData= crime_data.drop(columns=['DR_NO','Date Rptd','Rpt Dist No','Part 1-2','Crm Cd','Mocodes','Vict Age','Vict Sex','Vict Descent','Premis Desc','Weapon Used Cd','Weapon Desc','Status','Status Desc','Crm Cd 1','Crm Cd 2','Crm Cd 3','Crm Cd 4','Cross Street','Premis Cd'])
---> 11 myData['DATE OCC'] = myData['DATE OCC'].astype(float)#method for converting not working
12
13 X= myData.drop (columns=['AREA NAME']) #input data
~\anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors)
5696 else:
5697 # else, only a single dtype is given
-> 5698 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors)
5699 return self._constructor(new_data).__finalize__(self)
5700
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
580
581 def astype(self, dtype, copy: bool = False, errors: str = "raise"):
--> 582 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
583
584 def convert(self, **kwargs):
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, filter, **kwargs)
440 applied = b.apply(f, **kwargs)
441 else:
--> 442 applied = getattr(b, f)(**kwargs)
443 result_blocks = _extend_blocks(applied, result_blocks)
444
~\anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
623 vals1d = values.ravel()
624 try:
--> 625 values = astype_nansafe(vals1d, dtype, copy=True)
626 except (ValueError, TypeError):
627 # e.g. astype_nansafe can fail on object-dtype of strings
~\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
895 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
896 # Explicit copy, or required since NumPy can't view from / to object.
--> 897 return arr.astype(dtype, copy=True)
898
899 return arr.view(dtype)
ValueError: could not convert string to float: '01/08/2020 12:00:00 AM'
I changed the code to this
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Load the crime records from a CSV file in the working directory.
crime_data= pd.read_csv('Crime_Data_from_2020_to_Present.csv')
#offense= crime_data['Crm Cd Desc']
# Discard the listed columns; every column not named here is kept.
myData= crime_data.drop(columns=['DR_NO','Date Rptd','Rpt Dist No','Part 1-2','Crm Cd','Mocodes','Vict Age','Vict Sex','Vict Descent','Premis Desc','Weapon Used Cd','Weapon Desc','Status','Status Desc','Crm Cd 1','Crm Cd 2','Crm Cd 3','Crm Cd 4','Cross Street','Premis Cd'])
myData['DATE OCC'] = pd.to_datetime(myData['DATE OCC'])#parses the strings, but the column is now datetime64 rather than a number
X= myData.drop (columns=['AREA NAME']) #input data
y= myData['AREA NAME'] #output data
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
classifier = RandomForestClassifier(n_estimators = 50)
# NOTE(review): fit() is where the TypeError is raised; the input validation
# apparently cannot combine the datetime64 column with the other column dtypes.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred
But then I get this error. Please help
TypeError Traceback (most recent call last)
<ipython-input-17-18d34976fcb7> in <module>
16 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
17 classifier = RandomForestClassifier(n_estimators = 50)
---> 18 classifier.fit(X_train, y_train)
19 y_pred = classifier.predict(X_test)
20 y_pred
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in fit(self, X, y, sample_weight)
301 "sparse multilabel-indicator for y is not supported."
302 )
--> 303 X, y = self._validate_data(X, y, multi_output=True,
304 accept_sparse="csc", dtype=DTYPE)
305 if sample_weight is not None:
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
430 y = check_array(y, **check_y_params)
431 else:
--> 432 X, y = check_X_y(X, y, **check_params)
433 out = X, y
434
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
794 raise ValueError("y cannot be None")
795
--> 796 X = check_array(X, accept_sparse=accept_sparse,
797 accept_large_sparse=accept_large_sparse,
798 dtype=dtype, order=order, copy=copy,
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
532
533 if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
--> 534 dtype_orig = np.result_type(*dtypes_orig)
535
536 if dtype_numeric:
<__array_function__ internals> in result_type(*args, **kwargs)
TypeError: invalid type promotion

Your dataset has a column named DATE OCC that contains a date and time. You are getting that error because the model expects numeric (float) values, while the DATE OCC column is in object or datetime64 format. Split the date into numeric parts and encode the remaining text columns, as in the code below:
# Parse the date strings into timestamps. pd.to_datetime is used rather than
# .astype('datetime64') because pandas 2.x rejects a unit-less datetime64 cast.
myData['DATE OCC'] = pd.to_datetime(myData['DATE OCC'])
# Replace the timestamp with plain integer features the model can consume.
myData['day'] = myData['DATE OCC'].dt.day
myData['month'] = myData['DATE OCC'].dt.month
myData['Year'] = myData['DATE OCC'].dt.year
del myData['DATE OCC']
# Split off the target BEFORE one-hot encoding. get_dummies replaces every
# text column with indicator columns, so a text 'AREA NAME' (presumably the
# case here) would no longer exist under that name and both the drop and the
# lookup would raise KeyError.
y= myData['AREA NAME'] #output data
X= pd.get_dummies(myData.drop(columns=['AREA NAME'])) #input data

Related

Beginner - Naive Bayes Classification runs into Error - Record Problem?

For our university project we need to write machine learning code. Unfortunately, I don't have any programming experience and am a bit lost.
My classifier is Naive Bayes, and when I run the code I get this error:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Unfortunately, I can't make sense of this error, and the solutions I have found on the forum so far haven't helped. Could someone help me?
# One-hot encode df before modelling.
# NOTE(review): the ValueError reported below means the features contain
# NaN/inf. get_dummies does not fill in missing numeric values, so inspect
# `dat.isna().sum()` and impute or drop those rows/columns before predicting.
dat = pd.get_dummies(df)
# Define X and y
X = dat.drop('RESP', axis = 1)
y = dat['RESP']
# training and testing data
from sklearn.model_selection import train_test_split
# assign test data size 30%
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size= 0.3, random_state=0)
#BERNOULLI
# importing classifier
from sklearn.naive_bayes import BernoulliNB
# initializing the NB
classifer = BernoulliNB()
# training the model
classifer.fit(X_train, y_train)
# testing the model (this is the call that raises in the traceback)
y_pred = classifer.predict(X_test)
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-22-df3c037d90d0> in <module>
23
24 # testing the model
---> 25 y_pred = classifer.predict(X_test)
5 frames
/usr/local/lib/python3.8/dist-packages/sklearn/naive_bayes.py in predict(self, X)
80 """
81 check_is_fitted(self)
---> 82 X = self._check_X(X)
83 jll = self._joint_log_likelihood(X)
84 return self.classes_[np.argmax(jll, axis=1)]
/usr/local/lib/python3.8/dist-packages/sklearn/naive_bayes.py in _check_X(self, X)
1145 def _check_X(self, X):
1146 """Validate X, used only in predict* methods."""
-> 1147 X = super()._check_X(X)
1148 if self.binarize is not None:
1149 X = binarize(X, threshold=self.binarize)
/usr/local/lib/python3.8/dist-packages/sklearn/naive_bayes.py in _check_X(self, X)
517 def _check_X(self, X):
518 """Validate X, used only in predict* methods."""
--> 519 return self._validate_data(X, accept_sparse="csr", reset=False)
520
521 def _check_X_y(self, X, y, reset=True):
/usr/local/lib/python3.8/dist-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
798
799 if force_all_finite:
--> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
801
802 if ensure_min_samples > 0:
/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
112 ):
113 type_err = "infinity" if allow_nan else "NaN, infinity"
--> 114 raise ValueError(
115 msg_err.format(
116 type_err, msg_dtype if msg_dtype is not None else X.dtype
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead

I am trying to solve an NLP problem using PyTorch. I am using sklearn's StratifiedKFold for cross-validation, and I get a runtime error while looping through the folds.
error message
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-43-f1ccf940359e> in <module>
----> 1 oof_train(ds, cv, y, epochs = EPOCHES)
<ipython-input-42-eaeaa91076bd> in oof_train(ds, cv, y, epochs)
14 loss_fn = torch.nn.MSELoss()
15
---> 16 for fold, (train_idx, valid_idx) in enumerate(cv.split(range(len(ds)), y)):
17
18 train_ds = D.Subset(ds, train_idx)
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_split.py in split(self, X, y, groups)
729 to an integer.
730 """
--> 731 y = check_array(y, ensure_2d=False, dtype=None)
732 return super().split(X, y, groups)
733
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
614 array = array.astype(dtype, casting="unsafe", copy=False)
615 else:
--> 616 array = np.asarray(array, order=order, dtype=dtype)
617 except ComplexWarning as complex_warning:
618 raise ValueError("Complex data not supported\n"
/opt/conda/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
/opt/conda/lib/python3.7/site-packages/torch/tensor.py in __array__(self, dtype)
628 return handle_torch_function(Tensor.__array__, relevant_args, self, dtype=dtype)
629 if dtype is None:
--> 630 return self.numpy()
631 else:
632 return self.numpy().astype(dtype, copy=False)
RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.
The piece of code where the error occurs:
def oof_train(ds, cv, y, epochs = EPOCHES):
    """Walk the cross-validation folds and build a train/valid loader per fold.

    Only the posted part of the function is reproduced; the per-fold training
    code that presumably follows the loaders is not part of the excerpt.

    Args:
        ds: Dataset that is wrapped in ``D.Subset`` with the fold indices.
        cv: Splitter whose ``split(X, y)`` yields ``(train_idx, valid_idx)``.
        y: Labels handed to ``cv.split`` for stratification.
        epochs: Not used in the visible excerpt.
    """
    loss_fn = torch.nn.MSELoss()

    # NOTE(review): cv.split() converts `y` to a NumPy array. The traceback
    # above shows that conversion failing because `y` is a torch tensor that
    # requires grad. Passing the binned label Series, or
    # `y.detach().cpu().numpy()`, should avoid the RuntimeError.
    for fold, (train_idx, valid_idx) in enumerate(cv.split(range(len(ds)), y)):

        train_ds = D.Subset(ds, train_idx)
        loader = D.DataLoader(train_ds, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn = collate_fn,num_workers=0)

        valid_ds = D.Subset(ds, valid_idx)
        vloader = D.DataLoader(valid_ds, batch_size=BATCH_SIZE,
                               shuffle=False, collate_fn = collate_fn,num_workers=0)
Code where I initialize StratifiedKFold:
# Number of rows in the training frame.
df_size = train_csv.shape[0]
# Bin count from Sturges' rule: floor(log2(n) + 1).
num_bins = int(np.floor(np.log2(df_size) + 1))
# Bin the target into integer codes (labels=False) for stratification.
y = pd.cut(train_csv["target"], labels=False, bins=num_bins)
# Shuffled, seeded stratified splitter.
cv = StratifiedKFold(shuffle=True, random_state=SEED, n_splits=N_FOLDS)
Code where I create the dataset:
class LitDataset(D.Dataset):
    """Dataset returning ``(input_ids, attention_mask, meta, target)`` per item."""

    def __init__(self,token,meta,target):
        # token: indexable collection whose items expose `.input_ids` and
        # `.attention_mask`; meta and target are indexed with the same idx.
        self.token = token
        self.meta = meta
        self.target = target

    def __len__(self):
        # The dataset length is the number of tokenized samples.
        return len(self.token)

    def __getitem__(self,idx):
        encoded = self.token[idx]
        # ids and mask become tensors here; meta and target are passed through
        # unchanged.
        return (torch.tensor(encoded.input_ids),
                torch.tensor(encoded.attention_mask),
                self.meta[idx],
                self.target[idx])
def collate_fn(batch):
    """Pad a list of dataset items into batch tensors placed on DEVICE.

    Args:
        batch: Sequence of ``(ids, attention_mask, meta, target)`` tuples.

    Returns:
        Tuple ``(ids, attns, meta, targets)`` of tensors on ``DEVICE``.
    """
    ids,attns,meta,targets = zip(*batch)
    ids = pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id).to(DEVICE)
    # Padded positions must be masked out, so the attention mask is padded
    # with 0. Padding it with the pad token id is only correct when that id
    # happens to be 0, which is not true for every tokenizer.
    attns = pad_sequence(attns, batch_first=True, padding_value=0).to(DEVICE)
    meta =torch.tensor(meta).to(DEVICE)
    targets = torch.tensor(targets).float().to(DEVICE)
    return ids,attns,meta,targets


# Dataset built from the tokenized text, the meta features and the targets.
ds = LitDataset(train_csv_stat.token.values,train_meta_features,train_y.values)

Resolving Value Error: Decision Tree Regressor

I'm getting a ValueError in my Python code because part of the data contains strings as well as floats. I'm using the Kaggle IMDb data, which has both floats and strings. I need help changing the code so that the string columns are accepted.
I'm not sure how to convert the strings to floats in order to resolve the error.
enter code here
from sklearn.tree import DecisionTreeRegressor
#setting the imdb features import in the data
imdb_features=['original_title','genre','director','actors','avg_vote','reviews_from_users','reviews_from_critics']
# NOTE(review): several of these columns presumably hold text (the traceback
# below fails on the title 'Miss Jerry'). DecisionTreeRegressor needs numeric
# input, so text columns have to be encoded or dropped before fit().
X= imdb_moviedata[imdb_features]
#Building Model
imdb_moviedata_model = DecisionTreeRegressor(random_state=1)
#Fit the model
# NOTE(review): `y` is not defined anywhere in this excerpt.
imdb_moviedata_model.fit(X,y)
Receiving the following Error code:
------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-21-61e73a0f75b7> in <module>
3 imdb_moviedata_model = DecisionTreeRegressor(random_state=1)
4 #Fit the model
----> 5 float(imdb_moviedata_model.fit(X,y))
/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
1240 """
1241
-> 1242 super().fit(
1243 X, y,
1244 sample_weight=sample_weight,
/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
154 check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
155 check_y_params = dict(ensure_2d=False, dtype=None)
--> 156 X, y = self._validate_data(X, y,
157 validate_separately=(check_X_params,
158 check_y_params))
/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
427 # :(
428 check_X_params, check_y_params = validate_separately
--> 429 X = check_array(X, **check_X_params)
430 y = check_array(y, **check_y_params)
431 else:
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
596 array = array.astype(dtype, casting="unsafe", copy=False)
597 else:
--> 598 array = np.asarray(array, order=order, dtype=dtype)
599 except ComplexWarning:
600 raise ValueError("Complex data not supported\n"
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in __array__(self, dtype)
1779
1780 def __array__(self, dtype=None) -> np.ndarray:
-> 1781 return np.asarray(self._values, dtype=dtype)
1782
1783 def __array_wrap__(self, result, context=None):
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: 'Miss Jerry'

How to pass multiple text columns to Logistic Regression for multi-label classification

I am attempting to assign binary values to 10 labels using 3 features: the headline of an article, a summary of the article, and the id of the worker who created the labels. I'm stuck trying to create a model that accepts all 3 fields as input. Currently, it only works if I pass just one field. I know I am likely misusing the TfidfVectorizer, but I can't quite figure it out. Any help would be appreciated. The error I receive (full traceback below) is usually
ValueError: Found input variables with inconsistent numbers of samples: [3, 75897].
screenshot of dataframe
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import string
# Cleaned homework data, read from a path relative to the working directory.
df = pd.read_csv('../data/homework_clean.csv')
# Names of the ten label columns.
emotion_cols = [
    'emotion_0', 'emotion_1', 'emotion_2', 'emotion_3', 'emotion_4',
    'emotion_5', 'emotion_6', 'emotion_7', 'emotion_8', 'emotion_9',
]
def removeStopWords(sentence):
    """Return *sentence* with every match of ``re_stop_words`` replaced by a space."""
    # re_stop_words is a module-level compiled pattern defined later in the
    # script. It is only read here, so no `global` declaration is needed.
    return re_stop_words.sub(" ", sentence)
def stemming(sentence):
    """Stem each whitespace-separated word of *sentence* and rejoin with spaces.

    Uses the module-level ``stemmer``. Leading and trailing whitespace is
    stripped from the result.
    """
    # join() avoids building the result by repeated string concatenation.
    return " ".join(stemmer.stem(word) for word in sentence.split()).strip()
# Lower-case both text columns and strip punctuation. regex=True is passed
# explicitly: since pandas 2.0, Series.str.replace treats the pattern as a
# literal string by default, which would silently leave punctuation in place.
df['headline'] = df['headline'].str.lower()
df['headline'] = df['headline'].str.replace(r'[^\w\s]+', '', regex=True)
df['summary'] = df['summary'].str.lower()
df['summary'] = df['summary'].str.replace(r'[^\w\s]+', '', regex=True)
# Build one case-insensitive pattern matching any English stop word that is
# followed by a non-word character.
stop_words = set(stopwords.words('english'))
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
df['headline'] = df['headline'].apply(removeStopWords)
df['summary'] = df['summary'].apply(removeStopWords)
# Reduce the remaining words to their Snowball stems.
stemmer = SnowballStemmer('english')
df['headline'] = df['headline'].apply(stemming)
df['summary'] = df['summary'].apply(stemming)
from sklearn.model_selection import train_test_split
# 80/20 shuffled split with a fixed seed.
train, test = train_test_split(df, random_state = 42, test_size = .2, shuffle = True)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
# NOTE(review): likely cause of "inconsistent numbers of samples: [3, 75897]".
# FeatureUnion hands the SAME input to every transformer, so each
# TfidfVectorizer receives the whole 3-column DataFrame. Iterating a DataFrame
# yields its column labels, i.e. 3 "documents". Routing one column to each
# vectorizer (e.g. with sklearn.compose.ColumnTransformer) avoids this.
vectorizer = FeatureUnion([
('headline', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2), norm='l2')),
('summary', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2), norm='l2'))])
# Features are the three input columns; every other column is treated as a label.
x_train = train[['headline', 'summary', 'worker_id']]
y_train = train.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)
x_test = test[['headline', 'summary', 'worker_id']]
y_test = test.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)
# IF I only use one feature it works fine.
# x_train = train['headline']
# y_train = train.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)
# x_test = test['headline']
# y_test = test.drop(labels = ['headline', 'summary', 'worker_id'], axis=1)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.multiclass import OneVsRestClassifier
# Vectorize the text, then fit one logistic regression per label column.
OneVsRest_pipeline = Pipeline(steps = [
('featureunion', vectorizer),
('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
# fit() is the call that raises the ValueError shown in the traceback.
OneVsRest_pipeline.fit(x_train, y_train)
predictions = OneVsRest_pipeline.predict(x_test)
prediction_prob = OneVsRest_pipeline.predict_proba(x_test)
Full Traceback
ValueError Traceback (most recent call last)
<ipython-input-27-6394288c65f8> in <module>
4 ])
5
----> 6 OneVsRest_pipeline.fit(x_train, y_train)
7 predictions = OneVsRest_pipeline.predict(x_test)
8 prediction_prob = OneVsRest_pipeline.predict_proba(x_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
354 self._log_message(len(self.steps) - 1)):
355 if self._final_estimator != 'passthrough':
--> 356 self._final_estimator.fit(Xt, y, **fit_params)
357 return self
358
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py in fit(self, X, y)
214 "not %s" % self.label_binarizer_.classes_[i],
215 self.label_binarizer_.classes_[i]])
--> 216 for i, column in enumerate(columns))
217
218 return self
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py in _fit_binary(estimator, X, y, classes)
78 else:
79 estimator = clone(estimator)
---> 80 estimator.fit(X, y)
81 return estimator
82
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1530
1531 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
-> 1532 accept_large_sparse=solver != 'liblinear')
1533 check_classification_targets(y)
1534 self.classes_ = np.unique(y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
727 y = y.astype(np.float64)
728
--> 729 check_consistent_length(X, y)
730
731 return X, y
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
203 if len(uniques) > 1:
204 raise ValueError("Found input variables with inconsistent numbers of"
--> 205 " samples: %r" % [int(l) for l in lengths])
206
207

Sklearn tree classification on categorical data

I'm trying to create a simple classifier for disease symptoms using sklearn's decision tree classifier.
It gives the following error; both my code and the error are shown below.
Any suggestions?
import numpy as np
from sklearn import tree
# Three samples, each described by three free-text symptom strings.
symptoms = [['flat face','poor moro','hypotonia'],['small head','small jaw','overlapping fingers'], ['small eyes','cleft lip','cleft palate']]
# One diagnosis label per sample.
lables = [['Trisomy 21'],['Trisomy 18'],['Trisomy 13']]
classify = tree.DecisionTreeClassifier()
# fit() raises here: the feature strings cannot be converted to float, so
# they need to be encoded numerically first.
classify = classify.fit(symptoms, lables)
it gives the following error
ValueError Traceback (most recent call last)
<ipython-input-25-0f2c956618c2> in <module>
4 lables = [['Trisomy 21'],['Trisomy 18'],['Trisomy 13']]
5 classify = tree.DecisionTreeClassifier()
----> 6 classify = classify.fit(symptoms, lables)
c:\users\admin\appdata\local\programs\python\python36\lib\site-packages\sklearn\tree\tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
799 sample_weight=sample_weight,
800 check_input=check_input,
--> 801 X_idx_sorted=X_idx_sorted)
802 return self
803
c:\users\admin\appdata\local\programs\python\python36\lib\site-packages\sklearn\tree\tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
114 random_state = check_random_state(self.random_state)
115 if check_input:
--> 116 X = check_array(X, dtype=DTYPE, accept_sparse="csc")
117 y = check_array(y, ensure_2d=False, dtype=None)
118 if issparse(X):
c:\users\admin\appdata\local\programs\python\python36\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
525 try:
526 warnings.simplefilter('error', ComplexWarning)
--> 527 array = np.asarray(array, dtype=dtype, order=order)
528 except ComplexWarning:
529 raise ValueError("Complex data not supported\n"
c:\users\admin\appdata\local\programs\python\python36\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
499
500 """
--> 501 return array(a, dtype, copy=False, order=order)
502
503
ValueError: could not convert string to float: 'flat face'
You need to encode your string values as numbers, for example with LabelEncoder. The following will work for your requirement:
import numpy as np
import pandas as pd  # required for pd.concat / pd.DataFrame below
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
symptoms = [['flat face','poor moro','hypotonia'],['small head','small jaw','overlapping fingers'], ['small eyes','cleft lip','cleft palate']]
lables = [['Trisomy 21'],['Trisomy 18'],['Trisomy 13']]
# Put features and label side by side in one frame so every column can be
# encoded in a single pass.
df = pd.concat([pd.DataFrame(symptoms), pd.DataFrame(lables)], axis=1)
x_cols = ['sym1', 'sym2', 'sym3']
y_col = 'target'
df.columns = x_cols + [y_col]
# Encode each column independently as integer codes. The codes carry no
# meaningful order; they only make the strings acceptable to the tree.
df = df.apply(LabelEncoder().fit_transform)
classify = tree.DecisionTreeClassifier()
classify.fit(df[x_cols].values, df[y_col].values)

Categories

Resources