SkLearn DecisionTree Doesn't Include Numerical Features After Fitting

I'm trying to fit a dataframe with an SkLearn DecisionTree using the following code, but I get the error Length of feature_names, 9 does not match number of features, 8. The DecisionTree seems to have been fitted only on the categorical features produced by the one-hot encoding, not on the numerical feature. How can I include the numerical feature in the DecisionTree model?
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
from matplotlib import pyplot as plt
import graphviz

df = pd.DataFrame({'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category': ['asdf', 'asfa', 'asdfas', 'as'],
                   'num1': [1, 1, 0, 0],
                   'target': [1, 0, 0, 1]})

dtarget = df['target']
dfeatures = df.drop('target', axis=1)

num = dfeatures.select_dtypes(include=["int64"]).columns.tolist()
cat = dfeatures.select_dtypes(include=["object"]).columns.tolist()

transformer = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), cat),
    ]
)
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)
pipe = Pipeline(steps=[
    ('onehotenc', transformer),
    ('decisiontree', clf)
])

# Fit the training data to the pipeline
pipe.fit(dfeatures, dtarget)
pipe.named_steps['onehotenc'].get_feature_names_out().tolist()

dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=num + pipe.named_steps['onehotenc'].get_feature_names_out().tolist(),
                                class_names=['1', '0'],
                                filled=True)

The numeric feature isn't in your transformer, so the ColumnTransformer drops it by default. Since you don't want to apply any changes to it, let it pass through. You can either list the passthrough columns explicitly or set remainder='passthrough'; the remainder option is fine if you know num1 is the only other column that will ever be sent to the model.
transformer = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), cat),
    ],
    remainder='passthrough'
)
With this, your feature names will include the num1 column:
pipe.named_steps['onehotenc'].get_feature_names_out().tolist()
Output:
['cat__brand_NaN',
 'cat__brand_aaaa',
 'cat__brand_asdfasdf',
 'cat__brand_sadfds',
 'cat__category_as',
 'cat__category_asdf',
 'cat__category_asdfas',
 'cat__category_asfa',
 'remainder__num1']
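Once the transformer emits names for every column, the export_graphviz call no longer needs num prepended; the fitted transformer's output names already line up one-to-one with the features the tree was trained on. A minimal follow-up, using only the names from the code above:

dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                # all 9 names now come from the fitted transformer, in output order
                                feature_names=pipe.named_steps['onehotenc'].get_feature_names_out().tolist(),
                                class_names=['1', '0'],
                                filled=True)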

Related

Extract feature names after Pipeline usage with ColumnTransformer (sklearn)

I have the following toy code. I use a pipeline to automatically normalize numerical variables and apply one-hot encoding to the categorical ones. I can easily get the coefficients of the logistic regression model using pipe['logisticregression'].coef_, but how can I get all the feature names, in the same order as they appear in the coef_ matrix?
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# data from https://www.kaggle.com/datasets/uciml/adult-census-income
data = pd.read_csv("adult.csv")
data = data.iloc[0:3000, :]

target = "workclass"
y = data[target]
X = data.drop(columns=target)

numerical_columns_selector = make_column_selector(dtype_exclude=object)
categorical_columns_selector = make_column_selector(dtype_include=object)
numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

ct = ColumnTransformer([('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
                        ('std', StandardScaler(), numerical_columns)])
model = LogisticRegression(max_iter=500)
pipe = make_pipeline(ct, model)

data_train, data_test, target_train, target_test = train_test_split(
    X, y, random_state=42)
pipe.fit(data_train, target_train)
pipe['logisticregression'].coef_.shape
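No answer was posted for this one, but for reference: a fitted ColumnTransformer can report its output names, and with make_pipeline the step is addressable as pipe['columntransformer']. A minimal sketch, assuming scikit-learn >= 1.0 (where get_feature_names_out was introduced) and the pipe fitted above:

# names of the transformed features, in the same order as the coef_ columns
feature_names = pipe['columntransformer'].get_feature_names_out()

# one coefficient row per class, one column per feature
coefs = pd.DataFrame(pipe['logisticregression'].coef_, columns=feature_names)

On newer versions you can equivalently call pipe[:-1].get_feature_names_out() to collect the names from every transformer step at once.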

sklearn preprocessor error - _determine_key_type is not defined

I am using a simple dataset from the Datacamp site: multiple columns, some categorical, some numeric, with the goal of predicting clicks on a website. There are no problems with specifying the cat and num columns to be processed, but when I attempt to call preprocessor.fit_transform I get:
NameError: name '_determine_key_type' is not defined.
Why is this happening and how to fix it?
Code below:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.impute import SimpleImputer

numeric_features = [
    "search_engine_type_count",
    "product_type_count",
    "advertiser_type_count",
]
categorical_features = [
    "banner_pos",
    "device_type",
    "device_conn_type",
    "product_type",
    "advertiser_type",
]

numeric_transformer = Pipeline(
    steps=[
        ("boxcox", PowerTransformer(method="box-cox", standardize=False)),
        ("scaler", StandardScaler()),
    ])
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])

processed_features = pd.DataFrame(
    preprocessor.fit_transform(web_browser_df),
    columns=preprocessor.get_feature_names_out())

NameError: name '_determine_key_type' is not defined
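No answer was posted here either; one plausible cause (an assumption on my part, not confirmed in the thread) is a broken or mixed scikit-learn installation, since _determine_key_type is a private helper inside scikit-learn itself that ColumnTransformer imports internally, and user code should not be able to break it. A quick sanity check before digging further:

import sklearn
print(sklearn.__version__)  # confirm a single, recent version is on the path

# if the install looks inconsistent, reinstall cleanly (shell, not Python):
# pip uninstall scikit-learn
# pip install --upgrade scikit-learn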

Don't know why this error is coming?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("car-sales-extended.csv")
df.head()

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

np.random.seed(42)
df.dropna(subset=["Price"], inplace=True)

categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("Onehot", OneHotEncoder(handle_unknown="ignore"))])

door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
])

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())])

# Setup preprocessing steps (fill the missing values, then convert to numbers)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_features, numeric_transformer)])

# Creating a preprocessing and modelling pipeline
model = Pipeline(steps=[("preprocessing", preprocessor),
                        ("model", RandomForestClassifier())])

# Split data
x = df.drop("Price", axis=1)
y = df["Price"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit and score the model
model.fit(x_train, y_train)
model.score(x_test, y_test)
I don't know why this TypeError is coming:
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '['Odometer (KM)']' (type <class 'list'>) doesn't.
You swapped the transformer and columns arguments in:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_features, numeric_transformer)])
It should be:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_transformer, numeric_features)])

Scikit-learn SequentialFeatureSelector Input contains NaN, infinity or a value too large for dtype('float64'). even with pipeline

I'm trying to use SequentialFeatureSelector and for estimator parameter I'm passing it a pipeline that includes a step that inputes the missing values:
model = Pipeline(steps=[
    ('preprocessing',
     ColumnTransformer(transformers=[
         ('pipeline-1',
          Pipeline(steps=[('imputing', SimpleImputer(fill_value=-1, strategy='constant')),
                          ('preprocessing', StandardScaler())]),
          <sklearn.compose._column_transformer.make_column_selector object at 0x1300013d0>),
         ('pipeline-2',
          Pipeline(steps=[('imputing', SimpleImputer(fill_value='missing', strategy='constant')),
                          ('encoding', OrdinalEncoder(handle_unknown='ignore'))]),
          <sklearn.compose._column_transformer.make_column_selector object at 0x1300015b0>)])),
    ('model',
     LGBMClassifier(class_weight='balanced', random_state=1, reg_lambda=0.1))])
Nonetheless, when I pass this to the selector it throws an error, which doesn't make sense since I have already fit and evaluated this model and it runs fine:
fselector = SequentialFeatureSelector(estimator=model, scoring="roc_auc", cv=3, n_jobs=-1).fit(X, target)

[traceback through _assert_all_finite(X, allow_nan, msg_dtype) elided]
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
EDIT:
Reproducible example:
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

X, y = load_iris(return_X_y=True)
X[:10, 0] = np.NaN

# clf imputes the NaNs before the model ever sees them
clf = Pipeline([("preprocessing", SimpleImputer(missing_values=np.NaN)),
                ("model", LogisticRegression(random_state=1))])

SequentialFeatureSelector(estimator=clf,
                          scoring="accuracy",
                          cv=3).fit(X, y)
It shows the same error, even though clf on its own can be fit without problems.
Scikit-learn's documentation does not state that SequentialFeatureSelector works with pipeline objects; it only states that the class accepts an unfitted estimator. In view of this, you could remove the classifier from your pipeline, preprocess X, and then pass it along with an unfitted classifier for feature selection, as shown in the example below.
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler

X, y = load_iris(return_X_y=True)
X[:10, 0] = np.NaN

pipe = Pipeline([("preprocessing", SimpleImputer(missing_values=np.NaN)),
                 ("scaler", MaxAbsScaler())])

# Preprocess your data
X = pipe.fit_transform(X)

# Run the SequentialFeatureSelector
sfs = SequentialFeatureSelector(estimator=LogisticRegression(),
                                scoring="accuracy",
                                cv=3).fit(X, y)

# Check which features are important and transform X
sfs.get_support()
X = sfs.transform(X)
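One caveat worth noting with this workaround: the imputer and scaler are fitted on all of X before the cross-validated selection, so the validation folds influence the preprocessing and scores can be slightly optimistic. That leakage is the price of moving the preprocessing out of the selector.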
You can use SequentialFeatureSelector from the mlxtend package: https://rasbt.github.io/mlxtend/
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

X, y = load_iris(return_X_y=True)
X[:10, 0] = np.NaN

clf = Pipeline([
    ("preprocessing", SimpleImputer(missing_values=np.NaN)),
    ("model", LogisticRegression(random_state=1))
])

sfs = SequentialFeatureSelector(estimator=clf,
                                forward=True,
                                k_features='best',
                                scoring="accuracy",
                                cv=3, n_jobs=-1).fit(X, y)
sfs.k_feature_idx_
>>> (0, 1, 2, 3)
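Why this succeeds where the scikit-learn selector failed (my reading of the traceback above, not stated in the thread): mlxtend hands the raw X to the supplied estimator inside each CV split, so the pipeline's SimpleImputer runs before any model sees the data, whereas scikit-learn's SequentialFeatureSelector validates X for finiteness up front, which is where the _assert_all_finite error came from.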

Chaining transformations in scikit pipeline

I'm using the scikit-learn pipeline to preprocess a dataset. The dataset has four variables: ['monetary', 'frequency1', 'frequency2', 'recency'], and I want to preprocess all but recency. To preprocess, I first want to take the log and then standardize. However, when I get the transformed data from the pipeline, I get 7 columns (3 log, 3 standardized, recency). Is there a way to chain the transformations so that I can take the log, standardize the result, and get a 4-feature dataset?
def create_pipeline(df):
    all_but_recency = ['monetary', 'frequency1', 'frequency2']

    # Preprocess
    preprocessor = ColumnTransformer(
        transformers=[
            ('log', FunctionTransformer(np.log), all_but_recency),
            ('standardize', preprocessing.StandardScaler(), all_but_recency)],
        remainder='passthrough')

    # Pipeline
    estimators = [('preprocess', preprocessor)]
    pipe = Pipeline(steps=estimators)
    print(pipe.set_params().fit_transform(df).shape)
Thanks in advance
You have to apply the log and the standardization sequentially, in two ColumnTransformer steps. Note that the first ColumnTransformer returns a bare array (the transformed columns first, then the passthrough), so the second one has to select columns by position rather than by name. Try this!
def create_pipeline(df):
    all_but_recency = ['monetary', 'frequency1', 'frequency2']

    # Preprocess: log first; its output array puts the transformed
    # columns first, so the scaler selects them by position
    preprocessor1 = ColumnTransformer([('log', FunctionTransformer(np.log), all_but_recency)],
                                      remainder='passthrough')
    preprocessor2 = ColumnTransformer([('standardize', preprocessing.StandardScaler(), [0, 1, 2])],
                                      remainder='passthrough')

    # Pipeline
    estimators = [('preprocess1', preprocessor1), ('standardize', preprocessor2)]
    pipe = Pipeline(steps=estimators)
    print(pipe.set_params().fit_transform(df).shape)
Working example:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

iris = load_iris()
X, y = iris.data, iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
all_but_one = [0, 1, 2]

# Preprocess
preprocessor1 = ColumnTransformer([('log', FunctionTransformer(np.log), all_but_one)],
                                  remainder='passthrough')
preprocessor2 = ColumnTransformer([('standardize', preprocessing.StandardScaler(), all_but_one)],
                                  remainder='passthrough')

# Pipeline
estimators = [('preprocess1', preprocessor1), ('scaling', preprocessor2)]
pipe = Pipeline(steps=estimators)
pipe.fit_transform(df)
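An alternative worth knowing (not from the original answer, but a standard pattern): a Pipeline is itself a transformer, so you can nest the two steps in a Pipeline and hand that to a single ColumnTransformer. The same three columns are logged and then standardized in one pass, and the output keeps four columns. A minimal sketch using the question's column names:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
import numpy as np

all_but_recency = ['monetary', 'frequency1', 'frequency2']

# chain log -> standardize on the same columns inside one entry
log_then_scale = Pipeline([('log', FunctionTransformer(np.log)),
                           ('standardize', StandardScaler())])

preprocessor = ColumnTransformer(
    [('log_std', log_then_scale, all_but_recency)],
    remainder='passthrough')  # 'recency' passes through untouched -> 4 columns total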
