I am trying to replicate my lambda function in my pipeline:
def determine_healthy(_list):
    if ('no' in _list['smoker'] and (_list['bmi'] >= 18.5) and (_list['bmi'] <= 24.9)):
        return True
    else:
        return False

df['healthy'] = df.apply(lambda row: determine_healthy(row), axis=1)
The problem comes when I integrate it into my pipeline; I'm not sure whether the issue is the additional 'healthy' column that is being added. This error is thrown when I try to transform my X_train:
from sklearn.base import BaseEstimator, TransformerMixin

class HealthyAttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, items=None):
        if items is None: items = []
        self.l = items

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        #X = X.copy()
        temp_cols = X.columns.to_list()
        temp_cols = temp_cols.append('healthy')
        new_cols = {k: v for k, v in zip(range(len(temp_cols)), temp_cols)}
        healthy = X.apply(lambda row: determine_healthy(row), axis=1)
        combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)
        return combined_df
num_col = ['age', 'bmi']
cat_col = ['sex', 'smoker', 'region', 'children', 'healthy']

y = df.pop('charges')
X = df
all_col = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

transform_pipeline = ColumnTransformer([
    ('healthy', HealthyAttributeAdder(), all_col),
    ('ss', StandardScaler(), num_col),
    ('ohe', OneHotEncoder(drop='first'), cat_col),
])

price_pipeline = Pipeline([
    ('transform', transform_pipeline),
    ('lasso', Lasso())
])

health_transform = HealthyAttributeAdder()
health_transform.fit_transform(X_train)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_19796/500623650.py in <module>
----> 1 health_transform.fit_transform(X_train)
~\Venv\hdbtest\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
~\AppData\Local\Temp/ipykernel_19796/3713134512.py in transform(self, X)
11 temp_cols = X.columns.to_list()
12 temp_cols = temp_cols.append('healthy')
---> 13 new_cols = {k:v for k,v in zip(range(len(temp_cols)),temp_cols)}
14 healthy = X.apply(lambda row: determine_healthy(row), axis=1)
15 combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)
TypeError: object of type 'NoneType' has no len()
Error when I use it to predict:
price_pipeline.fit(X_train,y_train)
y_pred = price_pipeline.predict(X_test)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Venv\hdbtest\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
~\Venv\hdbtest\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
~\Venv\hdbtest\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'healthy'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\Venv\hdbtest\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
432 for col in columns:
--> 433 col_idx = all_columns.get_loc(col)
434 if not isinstance(col_idx, numbers.Integral):
~\Venv\hdbtest\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
KeyError: 'healthy'
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_19796/993407432.py in <module>
----> 1 price_pipeline.fit(X_train,y_train)
2 y_pred = price_pipeline.predict(X_test)
~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
~\Venv\hdbtest\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
~\Venv\hdbtest\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
670 self._check_n_features(X, reset=True)
671 self._validate_transformers()
--> 672 self._validate_column_callables(X)
673 self._validate_remainder(X)
674
~\Venv\hdbtest\lib\site-packages\sklearn\compose\_column_transformer.py in _validate_column_callables(self, X)
350 columns = columns(X)
351 all_columns.append(columns)
--> 352 transformer_to_input_indices[name] = _get_column_indices(X, columns)
353
354 self._columns = all_columns
~\Venv\hdbtest\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
439
440 except KeyError as e:
--> 441 raise ValueError("A given column is not a column of the dataframe") from e
442
443 return column_indices
ValueError: A given column is not a column of the dataframe
The first issue is actually independent of the ColumnTransformer usage; it is caused by a bug in the transform method of your HealthyAttributeAdder class. To get a consistent result, change the line

temp_cols = temp_cols.append('healthy')

into

temp_cols.append('healthy')

This is the classic "why does list.append return None" pitfall: append mutates the list in place and returns None, so the assignment rebinds temp_cols to None.
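A minimal demonstration of that pitfall:

>>> temp_cols = ['age', 'bmi']
>>> print(temp_cols.append('healthy'))   # append mutates in place and returns None
None
>>> temp_cols
['age', 'bmi', 'healthy']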
On the other hand, when you switch to ColumnTransformer, the issue is a different, well-documented one: ColumnTransformer applies its transformers in parallel, each to the X_train dataset you pass in. Therefore, when it comes to one-hot encoding your categorical features, OneHotEncoder is asked to transform the 'healthy' column (as listed in cat_col), but that column is not present in X_train; it only exists in the separate output of HealthyAttributeAdder, which the other transformers never see.
A possible way of solving the problem is to put HealthyAttributeAdder in its own pipeline step and apply it before your ColumnTransformer instance transform_pipeline:
class HealthyAttributeAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        #X = X.copy()
        temp_cols = X.columns.to_list()
        temp_cols.append('healthy')
        new_cols = {k: v for k, v in zip(range(len(temp_cols)), temp_cols)}
        healthy = X.apply(lambda row: determine_healthy(row), axis=1)
        combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)
        return combined_df
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso

transform_pipeline = ColumnTransformer([
    #('healthy', HealthyAttributeAdder(), all_col),
    ('ss', StandardScaler(), num_col),
    ('ohe', OneHotEncoder(drop='first'), cat_col),
])

healthy_pipeline = Pipeline([
    ('healthy', HealthyAttributeAdder())
])

price_pipeline = Pipeline([
    ('add_healthy', healthy_pipeline),
    ('transform', transform_pipeline),
    ('lasso', Lasso())
])

price_pipeline.fit(X_train, y_train)
y_pred = price_pipeline.predict(X_test)
Like so, the first step (add_healthy) of your price_pipeline adds the healthy column to X_train; this transformed X_train is then passed in parallel to both StandardScaler() and OneHotEncoder(), and in particular OneHotEncoder() no longer has any problem one-hot encoding the 'healthy' column.
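As a quick sanity check (a sketch, reusing the objects defined above), you can run the first step on its own and confirm the new column exists before the ColumnTransformer ever sees the data:

added = HealthyAttributeAdder().fit_transform(X_train)
print('healthy' in added.columns)   # True: OneHotEncoder will now find the column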
Related
directory = "path/to/directory"
filename = "C:\\Users\\home\\Desktop\\Python Projects\\TelcomCustomer-Churn_2.csv"
full_path = os.path.join(directory, filename)

def load_csv(path='C:\\Users\\home\\Desktop\\Python Projects\\TelcomCustomer-Churn_2.csv'):
    df = pd.read_csv('C:\\Users\\home\\Desktop\\Python Projects\\TelcomCustomer-Churn_2.csv')
    return df

def preprocess_data(df):
    df['TotalCharges'] = df['TotalCharges'].replace(" ", 0).astype('float')
    # Identify numeric and categorical columns
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    categorical_cols = df.select_dtypes(include='object').columns.tolist()
    # Fill in missing numeric values with the mean of the column
    for col in numeric_cols:
        df[col].fillna(df[col].mean(), inplace=True)
    # Fill in missing categorical values with the mode of the column
    for col in categorical_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    # One-hot encode categorical features
    X_cat = pd.get_dummies(df, columns=categorical_cols)
    X = X_cat.drop(columns='Churn_Yes', axis=1)
    y = X_cat['Churn_Yes']
    return X, y

def Split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test

print(Split_data(X, y))
the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_592\3782283586.py in <module>
----> 1 print(Split_data(X, y))
~\AppData\Local\Temp\ipykernel_592\18160545.py in Split_data(X, y)
5 # Scale the features
6 scaler = StandardScaler()
----> 7 X_train_scaled = scaler.fit_transform(X_train)
8 X_test_scaled = scaler.transform(X_test)
9 return X_train_scaled, X_test_scaled, y_train, y_test
~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
804 # Reset internal state before fitting
805 self._reset()
--> 806 return self.partial_fit(X, y, sample_weight)
807
808 def partial_fit(self, X, y=None, sample_weight=None):
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
839 """
840 first_call = not hasattr(self, "n_samples_seen_")
--> 841 X = self._validate_data(
842 X,
843 accept_sparse=("csr", "csc"),
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
744 array = array.astype(dtype, casting="unsafe", copy=False)
745 else:
--> 746 array = np.asarray(array, order=order, dtype=dtype)
747 except ComplexWarning as complex_warning:
748 raise ValueError(
~\anaconda3\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
2062
2063 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-> 2064 return np.asarray(self._values, dtype=dtype)
2065
2066 def __array_wrap__(
ValueError: could not convert string to float: '4223-BKEOR'
I do not understand where I have made the mistake in the above code.
The error happened because your X_train contains non-numeric values.
Try the below and see if it works:
numeric_cols = X_train.select_dtypes(include='number').columns.tolist()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])
It happens because your dataset may contain columns of object (string) type. Check the dtypes of the columns in the dataframe with dataset_Name.dtypes and make sure the numeric ones are cast to float64.
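For a concrete check (a sketch, assuming the X returned by preprocess_data above), you can list exactly which columns StandardScaler would choke on:

object_cols = X.select_dtypes(include='object').columns
print(object_cols)   # any ID-like string columns that survived preprocessing
print(X.dtypes)      # confirm everything you scale is numeric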
I want to set up a pipeline for the PUBG data on Kaggle to process it, but when I implement the pipeline I get this error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/tmp/ipykernel_35/3879657662.py in <module>
8 ])
9
---> 10 pubg_num_tr = num_pipeline.fit_transform(pubg_num)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
424 """
425 fit_params_steps = self._check_fit_params(**fit_params)
--> 426 Xt = self._fit(X, y, **fit_params_steps)
427
428 last_step = self._final_estimator
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
--> 355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
845 if y is None:
846 # fit method of arity 1 (unsupervised transformation)
--> 847 return self.fit(X, **fit_params).transform(X)
848 else:
849 # fit method of arity 2 (supervised transformation)
/tmp/ipykernel_35/2077244363.py in transform(self, X)
13 total_distance = X[:, walkDistance_ix] + X[:, rideDistance_ix]+X[:, swimDistance_ix]
14 if self.add_total_distance_per_seconda:
---> 15 add_total_distance_per_seconda = X[:, total_distance] / X[:, matchDuration_ix]
16 return np.c_[X, walk_distance_per_seconda, total_distance,
17 add_total_distance_per_seconda]
IndexError: arrays used as indices must be of integer (or boolean) type
my pipeline code is:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler())
])
pubg_num_tr = num_pipeline.fit_transform(pubg_num)
I implemented the attribute adder and it worked properly on its own, but when I run the pipeline it fails. I need a solution that does not require converting floats to integers, because that harms the data.
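No answer is included here, but the traceback itself points at the bug: on line 15 of the transformer, the freshly computed total_distance (an array of floats) is used as a column index, X[:, total_distance], when the intent was presumably to divide the total distance by the match duration. A sketch of the corrected lines, under that assumption:

# inside CombinedAttributesAdder.transform (sketch of the likely fix)
total_distance = X[:, walkDistance_ix] + X[:, rideDistance_ix] + X[:, swimDistance_ix]
if self.add_total_distance_per_seconda:
    # divide by the duration column; don't use the float array as an index
    add_total_distance_per_seconda = total_distance / X[:, matchDuration_ix]
    return np.c_[X, walk_distance_per_seconda, total_distance,
                 add_total_distance_per_seconda]

No float-to-integer conversion of the data is needed; only genuine column positions (the *_ix variables) have to be integers.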
I've got a dataset with multiple text columns and a target column. I'm trying to use a custom class with spaCy to get GloVe embeddings for my text columns, and to do it within a Pipeline, but I'm getting a ValueError. Following is my code:
data_features = df.copy()[["title", "description"]]
train_data, test_data, train_target, test_target = train_test_split(data_features, df['target'], test_size = 0.1)
I created this custom class to use GloVe embeddings. I got the code from this tutorial.
class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return [self.nlp(text).vector for text in X]
Loading the nlp model:
nlp = spacy.load("en_core_web_sm")
This is the column transformer that I'm trying to use in my pipeline:
col_preprocessor = ColumnTransformer(
    [
        ('title_glove', SpacyVectorTransformer(nlp), 'title'),
        ('description_glove', SpacyVectorTransformer(nlp), 'description'),
    ],
    remainder='drop',
    n_jobs=1
)
Here is my pipeline:
pipeline_glove = Pipeline([
    ('col_preprocessor', col_preprocessor),
    ('classifier', LogisticRegression())
])
When I run the fit method, I get the error that follows:
pipeline_glove.fit(train_data, train_target)
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-219-8543ea744205> in <module>
----> 1 pipeline_glove.fit(train_data, train_target)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
549
550 self._update_fitted_transformers(transformers)
--> 551 self._validate_output(Xs)
552
553 return self._hstack(list(Xs))
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_output(self, result)
410 raise ValueError(
411 "The output of the '{0}' transformer should be 2D (scipy "
--> 412 "matrix, array, or pandas DataFrame).".format(name))
413
414 def _validate_features(self, n_features, feature_names):
ValueError: The output of the 'title_glove' transformer should be 2D (scipy matrix, array, or pandas DataFrame).
The error message tells you what you need to fix:

ValueError: The output of the 'title_glove' transformer should be 2D
(scipy matrix, array, or pandas DataFrame).

But what your current transformer (SpacyVectorTransformer) returns is a list. You can fix it by turning the list into a pandas DataFrame, for instance like this:
import pandas as pd

class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return pd.DataFrame([self.nlp(text).vector for text in X])
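As a quick check (a sketch, reusing nlp and train_data from above), the transformer output should now have one row per document and a fixed number of vector columns:

vectors = SpacyVectorTransformer(nlp).transform(train_data['title'].head())
print(vectors.shape)   # (5, vector_dim): 2-D, which ColumnTransformer accepts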
Next time, please also provide a minimal, reproducible example. The code you provided has no imports, nor a DataFrame called "df".
I need your help!
I've been getting a ValueError below when trying to fit my Pipeline.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference = self.col_1 - self.col_2
        return difference.values
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)

# Using a custom column selector transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
                'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
                'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])

# Creating my survey_model Pipeline object.
# The Pipeline is a 2-step process: first a FeatureUnion transforming
# and combining the business features, cycle_1 features and the time
# feature, followed by fitting the transformed features into a
# RandomForestRegressor.
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])

# Trying to fit my Pipeline throws the ValueError described above
survey_model.fit(data, cycle_2_score.astype(int))
Some additional context: I'm building this model to have its predict_proba method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead of scikit-learn's own ColumnTransformer.
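To make that concrete (a sketch with made-up records), this is the kind of input the transformers have to survive, which is why each one coerces via pd.DataFrame(X) first:

# hypothetical grader input: a list of dicts rather than a DataFrame
records = [{'OWNERSHIP': 'for profit', 'CERTIFICATION': 'Medicare', 'BEDCERT': 92},
           {'OWNERSHIP': 'non profit', 'CERTIFICATION': 'Medicaid', 'BEDCERT': 60}]
X = pd.DataFrame(records)   # plain column selection now works again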
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer to select columns from a dataframe and return the
# dataframe as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values

simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features)
])
Finally, below is the full error raised
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-218-046724d81b69> in <module>()
----> 1 survey_model.fit(data, cycle_2_score.astype(int))
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
919
920 if any(sparse.issparse(f) for f in Xs):
--> 921 Xs = sparse.hstack(Xs).tocsr()
922 else:
923 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.
Further, the data and metadata can be obtained here:
%%bash
mkdir data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-train.csv -nc -P ./ml-data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-metadata.csv -nc -P ./ml-data
Changing my TimedeltaTransformer seems to have helped: first by converting the timedeltas to a flat array of seconds, and then reshaping it with reshape(-1, 1).
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference_list = []
        difference = self.col_1 - self.col_2
        for obj in difference:
            difference_list.append(obj.total_seconds())
        return np.array(difference_list).reshape(-1, 1)
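The reshape is the important part: FeatureUnion horizontally stacks its blocks, and scipy treats a 1-D array of shape (n,) as a single row of shape (1, n), which is exactly what the "Got blocks[0,2].shape[0] == 1" in the error says. For example:

import numpy as np
deltas = np.array([86400.0, 172800.0, 259200.0])   # seconds, one per sample
print(deltas.shape)                 # (3,)  - ambiguous 1-D block
print(deltas.reshape(-1, 1).shape)  # (3, 1) - one feature column, three samples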
I've been getting the ValueError below when trying to submit my Pipeline to a grader, and I'm not sure where I'm supposed to shave off 12,500 rows of data.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference_list = []
        difference = self.col_1 - self.col_2
        for obj in difference:
            difference_list.append(obj.total_seconds())
        return np.array(difference_list).reshape(-1, 1)

# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)

# Using a custom column selector transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
                'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
                'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])

# Creating my survey_model Pipeline object.
# The Pipeline is a 2-step process: first a FeatureUnion transforming
# and combining the business features, cycle_1 features and the time
# feature, followed by fitting the transformed features into a
# RandomForestRegressor.
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])

# Fitting my pipeline produces no error
survey_model.fit(data, cycle_2_score.astype(int))

# Calling the predict function and passing it into the grader raises a ValueError
grader.score.ml__survey_model(survey_model.predict)
The fitted pipeline looks like this
Pipeline(memory=None,
steps=[('features',
FeatureUnion(n_jobs=None,
transformer_list=[('business',
FeatureUnion(n_jobs=None,
transformer_list=[('simple',
Pipeline(memory=None,
steps=[('cst',
ColumnSelectTransformer(columns=['BEDCERT',
'RESTOT',
'INHOSP',
'CCRC_FACIL',
'SFF',
'CHOW_LAST_12MOS',
'SPRINKLER_STATUS',
'EXP_TOTAL',
'ADJ_TOTAL'])),
('imputer',
SimpleImpute...
transformer_weights=None, verbose=False)),
('forest',
RandomForestRegressor(bootstrap=True, criterion='mse',
max_depth=None, max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=None,
oob_score=False, random_state=None,
verbose=0, warm_start=False))],
verbose=False)
Some additional context: I'm building this model to have its predict method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead of scikit-learn's own ColumnTransformer.
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer to select columns from a dataframe and return the
# dataframe as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values

simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features)
])
Finally, below is the full error raised
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-165-790ca6139493> in <module>()
----> 1 grader.score.ml__survey_model(survey_model.predict)
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in func(*args, **kw)
92 def __getattr__(self, method):
93 def func(*args, **kw):
---> 94 return self(method, *args, **kw)
95 return func
96
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in __call__(self, question_name, func)
88 return
89 test_cases = json.loads(resp.text)
---> 90 test_cases_grading(question_name, func, test_cases)
91
92 def __getattr__(self, method):
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in test_cases_grading(question_name, func, test_cases)
40 for test_case in test_cases:
41 if inspect.isroutine(func):
---> 42 sub_res = func(*test_case['args'], **test_case['kwargs'])
43 elif not test_case['args'] and not test_case['kwargs']:
44 sub_res = func
/opt/conda/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
419 Xt = X
420 for _, name, transform in self._iter(with_final=False):
--> 421 Xt = transform.transform(Xt)
422 return self.steps[-1][-1].predict(Xt, **predict_params)
423
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in transform(self, X)
963 return np.zeros((X.shape[0], 0))
964 if any(sparse.issparse(f) for f in Xs):
--> 965 Xs = sparse.hstack(Xs).tocsr()
966 else:
967 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
Fixing my TimedeltaTransformer helped.
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        timedelta_series = (pd.to_datetime(X[self.t1_col]) - pd.to_datetime(X[self.t2_col]))
        array_list = []
        for x in timedelta_series:
            array_list.append(x.total_seconds())
        return np.array(array_list).reshape(-1, 1)
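The key change is that transform now derives the timedeltas from whatever X it is handed, instead of returning columns cached during fit; that is what removes the mismatch between the 13,892 rows seen at fit time and the grader's 1,544-row test batches. As a side note (a sketch, assuming the same datetime-parseable columns), the loop can also be written vectorized:

def transform(self, X):
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    delta = pd.to_datetime(X[self.t1_col]) - pd.to_datetime(X[self.t2_col])
    # Series.dt.total_seconds() converts the whole timedelta column at once
    return delta.dt.total_seconds().to_numpy().reshape(-1, 1)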