import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv("car-sales-extended.csv")
df.head()
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
np.random.seed(42)
df.dropna(subset=["Price"], inplace=True)
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("Onehot", OneHotEncoder(handle_unknown="ignore"))])

door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
])

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())])
# Setup preprocessing steps (fill the missing values, then convert to numbers)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_features, numeric_transformer)])

# Create a preprocessing and modelling pipeline
model = Pipeline(steps=[("preprocessing", preprocessor),
                        ("model", RandomForestClassifier())])
# Split data
x = df.drop("Price", axis=1)
y = df["Price"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Fit and score the model
model.fit(x_train, y_train)
model.score(x_test, y_test)
I don't know why this TypeError is raised:
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '['Odometer (KM)']' (type <class 'list'>) doesn't.
You swapped the transformer and the column list in:

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_features, numeric_transformer)])

Each entry in transformers must be a (name, transformer, columns) tuple, so it should be:

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_transformer, numeric_features)])

Note also that Price is a continuous quantity, so RandomForestRegressor is the appropriate estimator here rather than RandomForestClassifier.
Related
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

model = DecisionTreeRegressor()
my_pipeline = Pipeline(steps=[
    ("scale", StandardScaler),
    ("preprocessor", preprocessor),
    ("model", model)
])
my_pipeline.fit(X_train, y_train)
This raises AttributeError: 'DataFrame' object has no attribute 'fit'.
When I run the code without the scaling step there is no error, but with the scaling it shows an error that is weird (for me).
So my questions are:
How can I properly add a scaler to my pipeline?
Why is the variable my_pipeline a pandas object?
Edit
# Create an imputer to handle missing values in numerical_cols
from sklearn.impute import SimpleImputer
numerical_transformer = SimpleImputer(strategy="mean")

# Create a pipeline to handle categorical values and their missing values
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

categorical_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype != "object"]
numerical_cols.remove("PassengerId")
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ])
The error is raised at my_pipeline.fit(X_train, y_train).
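One concrete bug in the pipeline above: StandardScaler is passed as the class, not an instance (StandardScaler vs StandardScaler()), so that step has no fit/transform of its own. Beyond that, a scaler placed in front of the ColumnTransformer would also receive the raw categorical strings. A minimal sketch of one common arrangement, reusing the column lists and categorical transformer from the edit above (the imputation strategy here is an assumption, not the asker's final code):

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor

# Scale only the numeric columns, inside their ColumnTransformer branch,
# so the one-hot encoded categoricals are left untouched.
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scale", StandardScaler()),  # note the parentheses: an instance, not the class
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

my_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", DecisionTreeRegressor()),
])
my_pipeline.fit(X_train, y_train)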
I am using a simple dataset from the DataCamp site - multiple columns, some categorical, some numeric - looking to predict clicks on a website.
There were no problems specifying the cat and num columns to be processed, but when I attempt to call preprocessor.fit_transform I get:
NameError: name '_determine_key_type' is not defined
Why is this happening, and how do I fix it?
Code below:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.impute import SimpleImputer
numeric_features = [
    "search_engine_type_count",
    "product_type_count",
    "advertiser_type_count",
]
categorical_features = [
    "banner_pos",
    "device_type",
    "device_conn_type",
    "product_type",
    "advertiser_type",
]
numeric_transformer = Pipeline(steps=[
    ("boxcox", PowerTransformer(method="box-cox", standardize=False)),
    ("scaler", StandardScaler()),
])
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])
processed_features = pd.DataFrame(
    preprocessor.fit_transform(web_browser_df),
    columns=preprocessor.get_feature_names_out())
NameError: name '_determine_key_type' is not defined
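_determine_key_type is a private helper inside scikit-learn itself (sklearn.utils), so user code normally cannot produce a NameError for it; in most reports the cause is a stale or partially upgraded scikit-learn installation rather than the pipeline code. A hedged first check:

# Confirm which scikit-learn is actually being imported, and its version.
import sklearn
print(sklearn.__version__)
print(sklearn.__file__)

# If the version or install path looks wrong, a clean reinstall usually
# clears NameErrors on sklearn's own private helpers, e.g.:
#   pip install --upgrade --force-reinstall scikit-learn
# then restart the kernel/interpreter before re-running the pipeline.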
I'm a beginner trying to learn the sklearn Pipeline. I get ValueError: could not convert string to float when I run my code below. I'm not sure what the reason is, since OneHotEncoder shouldn't have any problem converting strings to floats for categorical variables.
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
df = pd.read_csv('https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv', skipinitialspace=True)
x_cols = [c for c in df.columns if c!='income']
X = df[x_cols]
y = df['income']
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
preprocessor = ColumnTransformer(
    transformers=[
        ('imputer', SimpleImputer(strategy='most_frequent'),
         ['workclass', 'education', 'native-country']),
        ('onehot', OneHotEncoder(), ['workclass', 'education', 'marital-status',
                                     'occupation', 'relationship', 'race', 'sex', 'native-country'])
    ]
)
clf = Pipeline([('preprocessor', preprocessor),
                ('classifier', RandomForestClassifier())])
clf.fit(X_train, y_train)
Unfortunately, there is an issue with scikit-learn's SimpleImputer when it tries to impute string variables. Here is an open issue about it on their GitHub page.
To get around this, I'd recommend splitting up your pipeline into two steps: 1) just the replacement of null values, and 2) the rest, something like this:
cols_with_null = ['workclass', 'education', 'native-country']
preprocessor = ColumnTransformer(
    transformers=[
        ('imputer',
         SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
         cols_with_null),
    ])

preprocessor.fit(X_train)
X_train_new = preprocessor.transform(X_train)
for icol, col in enumerate(cols_with_null):
    X_train.loc[:, col] = X_train_new[:, icol]

# confirm there are no null values left in these columns:
for col in cols_with_null:
    print('{}, null values: {}'.format(col, pd.isnull(X_train[col]).sum()))
Now that you have X_train with no null values, the rest should work without SimpleImputer:
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), ['workclass', 'education', 'marital-status',
                                     'occupation', 'relationship', 'race', 'sex', 'native-country'])])

clf = Pipeline([('preprocessor', preprocessor),
                ('classifier', RandomForestClassifier())])
clf.fit(X_train, y_train)
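Alternatively, on reasonably recent scikit-learn versions (where SimpleImputer does handle object columns with the most_frequent strategy), you can keep everything in a single Pipeline by chaining the imputer and the encoder inside one ColumnTransformer branch, the same pattern used in the car-sales example at the top of this page. A sketch (handle_unknown='ignore' is added here so unseen test categories don't break transform):

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

categorical_cols = ['workclass', 'education', 'marital-status',
                    'occupation', 'relationship', 'race', 'sex', 'native-country']

# Impute first, then one-hot encode, inside one branch: the encoder only
# ever sees imputed values, and only encoded numbers reach the classifier.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_cols)])

clf = Pipeline([('preprocessor', preprocessor),
                ('classifier', RandomForestClassifier())])
clf.fit(X_train, y_train)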
I'm using sklearn pipelines to preprocess my data.
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', KNNImputer(n_neighbors=2, weights='uniform',
                           metric='nan_euclidean', add_indicator=True))
])
categorical_transformer = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
numeric_features = ['Latitud', 'Longitud', 'Habitaciones', 'Dormitorios', 'BaƱos',
                    'Superficie_Total', 'Superficie_cubierta']
categorical_features = ['Tipo_de_propiedad']

from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)])
The feature Tipo_de_propiedad has 3 classes: 'Departamento', 'Casa', 'PH'. So the 7 numeric features plus these dummies should give me 10 columns after transforming, but when I apply fit_transform it returns 14 features.
train_transfor=pd.DataFrame(preprocessor.fit_transform(X_train))
train_transfor.head()
When I use pd.get_dummies it works well, but I can't use that inside the Pipeline; OneHotEncoder is useful because I can fit it on the train set and transform the test set.
dummy=pd.get_dummies(df30[["Tipo_de_propiedad"]])
df_new=pd.concat([df30,dummy],axis=1)
df_new.head()
Your KNNImputer was constructed with add_indicator=True, so the additional columns are missing-value indicator columns appended by the imputer: 14 - 10 = 4 of them, which suggests four of your numeric columns contain missing values.
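A quick way to check, and to get back to exactly 10 columns if the indicators aren't wanted, is to rebuild the numeric branch with add_indicator=False. A minimal sketch:

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Same numeric branch, but without appending missing-value indicator columns.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', KNNImputer(n_neighbors=2, weights='uniform',
                           metric='nan_euclidean', add_indicator=False))
])
# With 7 numeric features and 3 one-hot categories, fit_transform
# should now return the expected 10 columns.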
Code:
num_features = [feature for feature in x.columns if x[feature].dtypes != 'O']
x[num_features].replace('N',value=0)
from sklearn.preprocessing import StandardScaler
stds = StandardScaler()
x[num_features]= stds.fit(x)
Error:
ValueError: could not convert string to float: 'N'
To select numeric columns:
numeric_features = df.select_dtypes(include='number').columns.tolist()
Your method right now will not exclude booleans (True and False), category values, or date and time columns.
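A small illustration of the difference (the column names here are made up): bool and category columns are not matched by 'number':

import pandas as pd

demo = pd.DataFrame({
    'a': [1.0, 2.0],                   # float: selected
    'b': [True, False],                # bool: not matched by 'number'
    'c': pd.Categorical(['x', 'y']),   # category: not matched either
})
print(demo.select_dtypes(include='number').columns.tolist())  # ['a']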
Just having numeric values is not enough. We may also have missing values. We need to deal with them before normalisation:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# numeric transformer
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# impute missing values with the column median, then scale
normalise_numeric_features = numeric_transformer.fit_transform(df[numeric_features])
This might not be enough, as you might have different data types in your dataset, each needing its own transformer. You can do:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
# select feature types
numeric_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(include=['category']).columns.tolist()

# create a transformer for each feature type
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# combine them in a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# if you have a model, you can add it to a pipeline too
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

# fit takes the features, runs the preprocessing (impute, scale, encode; i.e. fit_transform), and trains the model
clf.fit(df[train_features], df[train_target])

# predict just transforms the features, reusing the statistics learned from the training data
clf.predict(df[test_features])
Hope this helps you get the most out of scikit-learn.