Converting string values to numeric with fit_transform(data) in Python

Hey, I've been stuck on this problem for 2 hours; can someone explain why I get this error?
I'm supposed to turn string values (representing 10,000 rows of 3 states and a gender)
into numeric values, and I don't know what the problem is. I saw someone on Udemy do it and it worked fine.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
Error:
Input In [15], in <cell line: 1>()
----> 1 X_train = sc.fit_transform(X_train)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py:867, in TransformerMixin.fit_transform(self, X, y, **fit_params)
863 # non-optimized default implementation; override when a better
864 # method is possible for a given clustering algorithm
865 if y is None:
866 # fit method of arity 1 (unsupervised transformation)
--> 867 return self.fit(X, **fit_params).transform(X)
868 else:
869 # fit method of arity 2 (supervised transformation)
870 return self.fit(X, y, **fit_params).transform(X)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\preprocessing\_data.py:809, in StandardScaler.fit(self, X, y, sample_weight)
807 # Reset internal state before fitting
808 self._reset()
--> 809 return self.partial_fit(X, y, sample_weight)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\preprocessing\_data.py:844, in StandardScaler.partial_fit(self, X, y, sample_weight)
812 """Online computation of mean and std on X for later scaling.
813
814 All of X is processed as a single batch. This is intended for cases
(...)
841 Fitted scaler.
842 """
843 first_call = not hasattr(self, "n_samples_seen_")
--> 844 X = self._validate_data(
845 X,
846 accept_sparse=("csr", "csc"),
847 dtype=FLOAT_DTYPES,
848 force_all_finite="allow-nan",
849 reset=first_call,
850 )
851 n_features = X.shape[1]
853 if sample_weight is not None:
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py:577, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
575 raise ValueError("Validation should be done on X, y or both.")
576 elif not no_val_X and no_val_y:
--> 577 X = check_array(X, input_name="X", **check_params)
578 out = X
579 elif no_val_X and not no_val_y:
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\utils\validation.py:856, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
854 array = array.astype(dtype, casting="unsafe", copy=False)
855 else:
--> 856 array = np.asarray(array, order=order, dtype=dtype)
857 except ComplexWarning as complex_warning:
858 raise ValueError(
859 "Complex data not supported\n{}\n".format(array)
860 ) from complex_warning
ValueError: could not convert string to float: 'Spain'

You need to encode your string columns (categorical features) first: use OrdinalEncoder(), LabelEncoder(), or OneHotEncoder() to convert the categorical columns to numeric. You can only scale numerical variables.
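For example, a minimal sketch of encoding before scaling (assuming X_train is a NumPy array with the country in column 1 and the gender in column 2, as in your later code):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# One-hot encode the string columns; pass the numeric columns through unchanged.
# Note: in sklearn >= 1.2 the keyword is sparse_output=False instead of sparse=False.
ct = ColumnTransformer([('ohe', OneHotEncoder(sparse=False), [1, 2])], remainder='passthrough')
X_train = ct.fit_transform(X_train)
# Every column is numeric now, so scaling works.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)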

OK, I figured it out.
# Perform label encoding for the gender variable
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# Perform one-hot encoding for the geography variable
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('ohe', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=np.float64)  # np.str would keep the values as strings
# Drop the first dummy column to avoid the dummy-variable trap
X = X[:, 1:]

Related

I am trying to fit my data for Linear Regression and getting this error [duplicate]

This question already has answers here:
How do I remove NaN values from a NumPy array? (13 answers)
> This is the error I am getting. I am new to Python; please help with this.
##########################################################################
ValueError Traceback (most recent call last)
Cell In[97], line 4
1 LR= LinearRegression()
3 #fit
----> 4 LR.fit(X,Y)
6 #predict
7 y_predict = LR.predict(X_test)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_base.py:649, in LinearRegression.fit(self, X, y, sample_weight)
645 n_jobs_ = self.n_jobs
647 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 649 X, y = self._validate_data(
650 X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
651 )
653 sample_weight = _check_sample_weight(
654 sample_weight, X, dtype=X.dtype, only_non_negative=True
655 )
657 X, y, X_offset, y_offset, X_scale = _preprocess_data(
658 X,
659 y,
(...)
662 sample_weight=sample_weight,
663 )
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:554, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
552 y = check_array(y, input_name="y", **check_y_params)
553 else:
--> 554 X, y = check_X_y(X, y, **check_params)
555 out = X, y
557 if not no_val_X and check_params.get("ensure_2d", True):
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:1104, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1099 estimator_name = _check_estimator_name(estimator)
1100 raise ValueError(
1101 f"{estimator_name} requires y to be passed, but the target y is None"
1102 )
-> 1104 X = check_array(
1105 X,
1106 accept_sparse=accept_sparse,
1107 accept_large_sparse=accept_large_sparse,
1108 dtype=dtype,
1109 order=order,
1110 copy=copy,
1111 force_all_finite=force_all_finite,
1112 ensure_2d=ensure_2d,
1113 allow_nd=allow_nd,
1114 ensure_min_samples=ensure_min_samples,
1115 ensure_min_features=ensure_min_features,
1116 estimator=estimator,
1117 input_name="X",
1118 )
1120 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1122 check_consistent_length(X, y)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:919, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
913 raise ValueError(
914 "Found array with dim %d. %s expected <= 2."
915 % (array.ndim, estimator_name)
916 )
918 if force_all_finite:
--> 919 _assert_all_finite(
920 array,
921 input_name=input_name,
922 estimator_name=estimator_name,
923 allow_nan=force_all_finite == "allow-nan",
924 )
926 if ensure_min_samples > 0:
927 n_samples = _num_samples(array)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:111, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
109 if X.dtype == np.dtype("object") and not allow_nan:
110 if _object_dtype_isnan(X).any():
--> 111 raise ValueError("Input contains NaN")
113 # We need only consider float arrays, hence can early return for all else.
114 if X.dtype.kind not in "fc":
ValueError: Input contains NaN
###################################################################
This is the code I am running that produces the error above.
LR = LinearRegression()
#fit
LR.fit(X,Y)
#predict
y_predict = LR.predict(X_test)
It looks like the X and Y values you're passing to the fit function are not properly formatted, which is causing the issue. Specifically, it seems like one of the values in X might be null (NaN). I'm not familiar with all the modules you're using, but checking to make sure X and Y are set correctly is a good first step.
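A quick way to check (a sketch, assuming X and Y hold your features and target as arrays or dataframes):
import numpy as np
import pandas as pd
# Count missing values per column
print(pd.DataFrame(X).isna().sum())
# Drop the rows containing NaN, keeping X and Y aligned
mask = pd.DataFrame(X).notna().all(axis=1).to_numpy()
X, Y = X[mask], Y[mask]
# Alternatively, impute instead of dropping:
# from sklearn.impute import SimpleImputer
# X = SimpleImputer(strategy='mean').fit_transform(X)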
Use train_test_split. A simple LinearRegression example:
X = dataset[['statezip','city','bedrooms','sqft_living','sqft_lot','sqft_above','floors',]]
y = dataset['price']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
from sklearn import linear_model
import numpy as np
regr = linear_model.LinearRegression()
regr.fit(X_train,y_train)
y_pred_RMSE = regr.predict(X_test)
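Since the last variable is named y_pred_RMSE, the RMSE itself could be computed along these lines (a sketch):
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred_RMSE))
print('RMSE:', rmse)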

Support vector regression: ValueError: Found array with dim 3. StandardScaler expected <= 2

I'm trying to perform a support vector regression on my dataset; however, I have run into a problem when trying to standard-scale it.
Originally I was facing the error "ValueError: Expected 2D array, got 1D array instead".
I tried to change the shape of my datasets, but I'm now facing "ValueError: Found array with dim 3. StandardScaler expected <= 2".
Could anyone please help me figure out how to eliminate this issue?
Below is what I have done:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset=pd.read_excel(r'C:\Users\Sammy\OneDrive - International Campus, Zhejiang University\Desktop\Data\BAYC Data.xlsx')
print(dataset)
dataset.columns
x=dataset.iloc[:,1].values
y=dataset.iloc[:,2].values
x=dataset.iloc[:,0].to_frame()
x=np.expand_dims(x, axis = -1)
from sklearn.preprocessing import StandardScaler
st_x=StandardScaler()
st_y=StandardScaler()
X=st_x.fit_transform(x)
Y=st_y.fit_transform(y)
And this is the error I'm receiving:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_11140\1840045772.py in <module>
----> 1 X=st_x.fit_transform(x)
2 Y=st_y.fit_transform(y)
~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
804 # Reset internal state before fitting
805 self._reset()
--> 806 return self.partial_fit(X, y, sample_weight)
807
808 def partial_fit(self, X, y=None, sample_weight=None):
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
839 """
840 first_call = not hasattr(self, "n_samples_seen_")
--> 841 X = self._validate_data(
842 X,
843 accept_sparse=("csr", "csc"),
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
792 ) from e
793 if not allow_nd and array.ndim >= 3:
--> 794 raise ValueError(
795 "Found array with dim %d. %s expected <= 2."
796 % (array.ndim, estimator_name)
ValueError: Found array with dim 3. StandardScaler expected <= 2.
You are trying to "push" a 3D array into a function that needs a 2D array.
I would recommend deleting rows 9 and 10, and editing the values in [] in rows 7 and 8 so that dataset.iloc gives you a 2D array :)
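A minimal sketch of what that looks like (assuming, as in the original code, that column 1 holds x and column 2 holds y):
from sklearn.preprocessing import StandardScaler
x = dataset.iloc[:, [1]].values  # 2D: shape (n_samples, 1)
y = dataset.iloc[:, [2]].values  # 2D as well, so fit_transform(y) also works
X = StandardScaler().fit_transform(x)
Y = StandardScaler().fit_transform(y)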

Sklearn Naive Bayes with multiple features

Background
I'm struggling to implement a Naive Bayes classifier in python with sklearn across multiple features.
The features I have are:
Title - some short text
Description - some longer text
Timestamp - a float representing an hour of the day (e.g. 18.0 = 6:00PM, 11.5 = 11:30AM)
The labels/classes are categorical strings: e.g. "Class1", "Class2", "Class3"
Aim
My goal is to use the 3 features to construct a Naive Bayes classifier that predicts the class label. I specifically wish to use all of the features at the same time, i.e. not simply the description feature.
Initial Approach
I have setup some pre-processing pipelines using sklearn as follows:
from sklearn import preprocessing, naive_bayes, feature_extraction, pipeline, model_selection, compose
text_columns = ['title', 'description']
time_columns = ['timestamp']
# get an 80-20 test-train split
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train['class'], test_size=0.2, random_state=RANDOM_STATE)
# convert the text data into vectors
text_pipeline = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
])
# preprocess by scaling the data, and binning the data
time_pipeline = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('bin', preprocessing.KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')),
])
# combine the pre-processors
preprocessor = compose.ColumnTransformer([
    ('text', text_pipeline, text_columns),
    ('time', time_pipeline, time_columns),
])
clf = pipeline.Pipeline([
    ('preprocessor', preprocessor),
    ('clf', naive_bayes.MultinomialNB()),
])
Here train is a pandas dataframe with the features and labels, read straight from a .csv file like this:
ID,title,description,timestamp,class
1,First Title String,"A description of the first title",13.0,Class1
2,Second Title String,"A description of the second title",17.5,Class2
Also note that I'm not setting most of the params for the transformers/classifiers, as I want to use a grid-search to find the optimum ones later on.
The problem
When I call clf.fit(X_train, y_train), I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7500/3039541201.py in <module>
33
34 # x = pd.DataFrame(text_pipeline.fit_transform(X_train['mean_checkin_time']))
---> 35 x = clf.fit(X_train, y_train)
36 # # print the number of features
37
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
~/.local/lib/python3.9/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
~/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
697 self._record_output_indices(Xs)
698
--> 699 return self._hstack(list(Xs))
700
701 def transform(self, X):
~/.local/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in _hstack(self, Xs)
789 else:
790 Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
--> 791 return np.hstack(Xs)
792
793 def _sk_visual_block_(self):
<__array_function__ internals> in hstack(*args, **kwargs)
~/.local/lib/python3.9/site-packages/numpy/core/shape_base.py in hstack(tup)
344 return _nx.concatenate(arrs, 0)
345 else:
--> 346 return _nx.concatenate(arrs, 1)
347
348
<__array_function__ internals> in concatenate(*args, **kwargs)
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 2 and the array at index 1 has size 3001
I have the following shapes for X_train and y_train:
X_train: (3001, 3)
y_train: (3001,)
Steps Taken
Individual Features
I can use the same pipelines with individual features (by altering the text_columns and time_columns arrays) and get a perfectly fine classifier, e.g. only using the "title" field, or only using the "timestamp". Unfortunately, these individual features are not accurate enough, so I would like to use all the features to build a more accurate classifier. The issue seems to arise when I attempt to combine more than one feature.
I'm open to potentially using multiple Naive Bayes classifiers, and trying to multiply the probabilities together to get some overall probability, but I honestly have no clue how to do that, and I'm sure I'm just missing something simple here.
Dropping the Time Features
I have tried running with only the text columns, i.e. "title" and "description", and I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7500/1900884535.py in <module>
33
34 # x = pd.DataFrame(text_pipeline.fit_transform(X_train['mean_checkin_time']))
---> 35 x = clf.fit(X_train, y_train)
36 # # print the number of features
37
~/.local/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
395
396 return self
~/.local/lib/python3.9/site-packages/sklearn/naive_bayes.py in fit(self, X, y, sample_weight)
661 Returns the instance itself.
662 """
--> 663 X, y = self._check_X_y(X, y)
664 _, n_features = X.shape
665
~/.local/lib/python3.9/site-packages/sklearn/naive_bayes.py in _check_X_y(self, X, y, reset)
521 def _check_X_y(self, X, y, reset=True):
522 """Validate X and y in fit methods."""
--> 523 return self._validate_data(X, y, accept_sparse="csr", reset=reset)
524
525 def _update_class_log_prior(self, class_prior=None):
~/.local/lib/python3.9/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
583
~/.local/lib/python3.9/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
980
--> 981 check_consistent_length(X, y)
982
983 return X, y
~/.local/lib/python3.9/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
330 uniques = np.unique(lengths)
331 if len(uniques) > 1:
--> 332 raise ValueError(
333 "Found input variables with inconsistent numbers of samples: %r"
334 % [int(l) for l in lengths]
ValueError: Found input variables with inconsistent numbers of samples: [2, 3001]
And I have the following shapes:
X_train: (3001, 2)
y_train: (3001,)
Reshaping the Labels
I have also tried reshaping the y_train variable by wrapping the column selection in [] like so:
# new
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train[['class']], test_size=0.2, random_state=RANDOM_STATE)
# previous
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[text_columns + time_columns], train['class'], test_size=0.2, random_state=RANDOM_STATE)
so that the resultant shapes are:
X_train: (3001, 3)
y_train: (3001, 1)
But unfortunately this doesn't fix the problem.
Removing Naive Bayes Classifier
When I remove the final step of the pipeline (the naive_bayes.MultinomialNB()) and also remove the time features (the "timestamp" column), I can build a pre-processor that works just fine for the text. I.e. I can pre-process the text fields ("title", "description"), but when I add the classifier back, I get the error shown above under "Dropping the Time Features".
When vectorizing multiple text features, you should create a separate CountVectorizer (or TfidfVectorizer) instance for every feature. A single vectorizer given both columns receives a 2D frame, and iterating over a dataframe yields just the two column names, which is why the error reports input sizes [2, 3001]:
title_pipeline = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
])
description_pipeline = pipeline.Pipeline([
    ('vect', feature_extraction.text.CountVectorizer()),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
])
preprocessor = compose.ColumnTransformer([
    ('title', title_pipeline, text_columns[0]),
    ('description', description_pipeline, text_columns[1]),
    ('time', time_pipeline, time_columns),
])
P.S. The combination of CountVectorizer and TfidfTransformer is equivalent to TfidfVectorizer. Also, you may just skip tf-idf weighting and use only CountVectorizer for MultinomialNB.
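Following that P.S., the preprocessor could be written more compactly (a sketch under the same column assumptions; note each text column is selected as a plain string, not a list, so the vectorizer receives a 1D series of strings):
preprocessor = compose.ColumnTransformer([
    ('title', feature_extraction.text.TfidfVectorizer(), 'title'),
    ('description', feature_extraction.text.TfidfVectorizer(), 'description'),
    ('time', time_pipeline, time_columns),
])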

AttributeError while implementing FAMD with SMOTENC in an imblearn pipeline

I'm trying to implement a pipeline with FAMD, SMOTENC, and other preprocessing steps. However, it raises an error each time. If I remove FAMD from the pipeline, it works fine.
My code:
# Separate the dataset into two parts
num_df = X_train_new.select_dtypes(include=[np.number]).columns
cat_df = X_train_new.select_dtypes(exclude=[np.number]).columns
# Create a mask for categorical features
categorical_feature_mask = X_train_new.dtypes == object
print(categorical_feature_mask)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
# Create a pipeline to automate the preprocessing steps and SMOTENC together
num_pipe = make_pipeline(SimpleImputer(strategy='median'))
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
                         OneHotEncoder(handle_unknown='ignore'))
transformer = make_column_transformer((num_pipe, selector(dtype_include='number')),
                                      (cat_pipe, selector(dtype_include='object')), n_jobs=2)
# Oversampling with SMOTENC
from imblearn.over_sampling import SMOTENC
smote= SMOTENC(categorical_features=categorical_feature_mask,random_state=99)
!pip install prince
from prince import FAMD
famd=FAMD(n_components=4,random_state=99)
from imblearn.pipeline import make_pipeline as imb_pipeline
# Fit the random forest learner
rf = RandomForestClassifier(n_estimators=300, random_state=99)
pipe=imb_pipeline(transformer,smote,famd,rf)
pipe.fit(X_train_new,y_train_new)
print('Training Accuracy:%s'%pipe.score(X_train_new,y_train_new))
The error:
AttributeError Traceback (most recent call last)
<ipython-input-24-2b7ea084a318> in <module>()
3 rf=RandomForestClassifier(n_estimators=300,max_features=3,criterion='entropy',random_state=99)
4 pipe=imb_pipeline(transformer,smote,famd,rf)
----> 5 pipe.fit(X_train_new,y_train_new)
6 print('Training Accuracy:%s'%pipe.score(X_train_new,y_train_new))
6 frames
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
235
236 """
--> 237 Xt, yt, fit_params = self._fit(X, y, **fit_params)
238 if self._final_estimator is not None:
239 self._final_estimator.fit(Xt, yt, **fit_params)
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in _fit(self, X, y, **fit_params)
195 Xt, fitted_transformer = fit_transform_one_cached(
196 cloned_transformer, None, Xt, yt,
--> 197 **fit_params_steps[name])
198 elif hasattr(cloned_transformer, "fit_resample"):
199 Xt, yt, fitted_transformer = fit_resample_one_cached(
/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
564 def _fit_transform_one(transformer, weight, X, y, **fit_params):
565 if hasattr(transformer, 'fit_transform'):
--> 566 res = transformer.fit_transform(X, y, **fit_params)
567 else:
568 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.7/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
572 else:
573 # fit method of arity 2 (supervised transformation)
--> 574 return self.fit(X, y, **fit_params).transform(X)
575
576
/usr/local/lib/python3.7/dist-packages/prince/famd.py in fit(self, X, y)
27
28 # Separate numerical columns from categorical columns
---> 29 num_cols = X.select_dtypes(np.number).columns.tolist()
30 cat_cols = list(set(X.columns) - set(num_cols))
31
/usr/local/lib/python3.7/dist-packages/scipy/sparse/base.py in __getattr__(self, attr)
689 return self.getnnz()
690 else:
--> 691 raise AttributeError(attr + " not found")
692
693 def transpose(self, axes=None, copy=False):
AttributeError: select_dtypes not found
tl;dr: try adding sparse=False to your OneHotEncoder. Consider raising an Issue with prince, to handle sparse inputs.
You can see from the traceback that the problem is that FAMD.fit tries X.select_dtypes to separate categorical and numeric data. select_dtypes is a pandas function, so normally I would assume that prince is written to operate on dataframes and not the numpy arrays that sklearn uses internally (after converting from frames if necessary). However, looking at the source, a few lines above that one they do convert from numpy array to dataframe. But, the last trace message is from scipy. That hints that your X may actually be a sparse array. And indeed OneHotEncoder (earlier in your pipeline) prefers to output sparse arrays, and ColumnTransformer determines whether to transform into sparse or dense depending on its component parts and the parameter sparse_threshold.
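Concretely, the suggested change would look like this (a sketch; note that in sklearn >= 1.2 the parameter is spelled sparse_output instead of sparse):
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
                         OneHotEncoder(handle_unknown='ignore', sparse=False))
With a dense output from the ColumnTransformer, prince's FAMD can convert the array to a DataFrame and call select_dtypes on it.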

Losing the header of a CSV file after normalizing

I've written the following code to read a CSV file and run a column-wise normalization:
from sklearn import preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# reading Train values
Training ='Training.csv'
df = pd.read_csv(Training)
df =df.drop(df.columns[len(df.loc[1])-1],axis=1)
df =df.drop(df.columns[len(df.loc[1])-1],axis=1)
df.describe()
minmax_scaler= preprocessing.MinMaxScaler()
np_scaled = minmax_scaler.fit_transform(df)
normalized = pd.DataFrame(np_scaled)
normalized.describe()
np.shape(df)
np.shape(normalized)
My question is: why can't I see the headers in the normalized dataframe, despite it having the same shape as df? I've also tried to read the CSV file without a header, but the program crashes:
..............................
df = pd.read_csv(Training,header=None)
.........................
which delivers the following:
ValueError Traceback (most recent call last)
<ipython-input-15-dd18ba2a6204> in <module>()
14 df.describe()
15 minmax_scaler= preprocessing.MinMaxScaler()
---> 16 np_scaled = minmax_scaler.fit_transform(df)
17 normalized = pd.DataFrame(np_scaled)
18 normalized.describe()
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
492 if y is None:
493 # fit method of arity 1 (unsupervised transformation)
--> 494 return self.fit(X, **fit_params).transform(X)
495 else:
496 # fit method of arity 2 (supervised transformation)
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\preprocessing\data.py in fit(self, X, y)
290 # Reset internal state before fitting
291 self._reset()
--> 292 return self.partial_fit(X, y)
293
294 def partial_fit(self, X, y=None):
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\preprocessing\data.py in partial_fit(self, X, y)
316
317 X = check_array(X, copy=self.copy, ensure_2d=False, warn_on_dtype=True,
--> 318 estimator=self, dtype=FLOAT_DTYPES)
319
320 if X.ndim == 1:
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
380 force_all_finite)
381 else:
--> 382 array = np.array(array, dtype=dtype, order=order, copy=copy)
383
384 if ensure_2d:
ValueError: could not convert string to float: 'Feature458'
I'd be grateful for any hint about how I may solve this!
Well, that is because you use preprocessing.MinMaxScaler(), which returns an array, not a dataframe.
After you create a dataframe based on this matrix, it does not know anything about your original columns.
You could try something like
normalized = pd.DataFrame(np_scaled, columns=df.columns)
As for the latter example (with header=None): you then simply have your header as the first row of data, and when sklearn tries to convert a column name such as 'Feature458' into a float, you get your error.
