Trying to implement the decision tree regressor algorithm on some training data but when I call fit() I get an error.
# Split the raw DataFrame 70/30 into training and test sets.
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# Assembler configured to pack columns _1 .. _10 into one "features" vector column.
vecAssembler = VectorAssembler(inputCols=["_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9", "_10"], outputCol="features")
# Regressor that reads its inputs from "features" and its label from "_11".
dt = DecisionTreeRegressor(featuresCol="features", labelCol="_11")
# NOTE(review): vecAssembler.transform(...) is never called in the lines shown, so
# trainingData reaches fit() exactly as randomSplit produced it.
dt_model = dt.fit(trainingData)
Generates the error
File "spark.py", line 100, in <module>
main()
File "spark.py", line 87, in main
dt_model = dt.fit(trainingData)
File "/opt/spark/python/pyspark/ml/base.py", line 132, in fit
return self._fit(dataset)
File "/opt/spark/python/pyspark/ml/wrapper.py", line 295, in _fit
java_model = self._fit_java(dataset)
File "/opt/spark/python/pyspark/ml/wrapper.py", line 292, in _fit_java
return self._java_obj.fit(dataset._jdf)
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/opt/spark/python/pyspark/sql/utils.py", line 79, in deco
raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.IllegalArgumentException: u'requirement failed: Column features must be of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
But the data structures are exactly the same.
You are missing two steps: (1) the transformation step, i.e. actually applying the VectorAssembler to the data, and (2) selecting the features and label columns from the transformed data. I assume the data contains only numerical columns, i.e. no categorical data. Below is a generic flow for training a model with pyspark.ml to help you.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# Data processing part: names of the ten numeric input columns (col_1 .. col_10).
feature_cols = ['col_%d' % i for i in range(1, 11)]
vecAssembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# You missed these two steps:
# 1. run the assembler so the 'features' vector column is actually created
trans_data = vecAssembler.transform(data)
# 2. keep only the feature vector and the label (your label column name is col_11)
final_data = trans_data.select('features', 'col_11')
train_data, test_data = final_data.randomSplit([0.7, 0.3])

# ML part: the question trains a regressor, so a DecisionTreeRegressor is used here too.
dt = DecisionTreeRegressor(featuresCol='features', labelCol='col_11')
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)
# Proceed with the model evaluation part after this.
Related
These are the complete errors, and I am confused about what they are trying to say. I was following a tutorial from Google and copied its code almost verbatim, replacing only its dataset with my own, and then these errors occurred.
Traceback (most recent call last):
File "C:\Users\julia\Anaconda\envs\myenv\lib\site-packages\pandas\core\indexes\base.py", line 2897, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Objective 1'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/julia/Anaconda/envs/myenv/Mycode.py", line 333, in <module>
validation_targets=validation_targets)
File "C:/Users/julia/Anaconda/envs/myenv/Mycode.py", line 288, in train_nn_regression_model
steps=steps_per_period
File "C:\Users\julia\Anaconda\envs\myenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 367, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "C:\Users\julia\Anaconda\envs\myenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1158, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "C:\Users\julia\Anaconda\envs\myenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1185, in _train_model_default
input_fn, ModeKeys.TRAIN))
File "C:\Users\julia\Anaconda\envs\myenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1022, in _get_features_and_labels_from_input_fn
self._call_input_fn(input_fn, mode))
File "C:\Users\julia\Anaconda\envs\myenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1113, in _call_input_fn
return input_fn(**kwargs)
File "C:/Users/julia/Anaconda/envs/myenv/Mycode.py", line 268, in <lambda>
training_targets["Objective 1"],
File "C:\Users\julia\Anaconda\envs\myenv\lib\site-packages\pandas\core\frame.py", line 2980, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\julia\Anaconda\envs\myenv\lib\site-packages\pandas\core\indexes\base.py", line 2899, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Objective 1'
I am a machine learning beginner. When I followed the code from Google's machine learning tutorial in Python, some errors came up, and I am not sure what is going on.
# Step 1 - Set up and import necessary packages
from __future__ import print_function
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
# Only surface TensorFlow errors; keep pandas output to 10 rows with one decimal place.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)
# Step 2 - Load our data
zerlite_13X_error = pd.read_csv("zerlite_13x_error.csv", sep=",")
# print(zerlite_13X_error.head()) # Load data done
# Randomize the row order so that no pathological ordering in the file can harm the
# performance of Stochastic Gradient Descent. We first consider objective 1.
shuffled_index = np.random.permutation(zerlite_13X_error.index)
zerlite_13X_error = zerlite_13X_error.reindex(shuffled_index)
# Define features and Configure columns
# Define features which are parameters 1 to parameters 8
def preprocess_features(zerlite_13X_error):
    """Select the model's input features from the zerlite_13X_error data.

    Args:
      zerlite_13X_error: A pandas DataFrame containing at least the columns
        "Parameter 1" through "Parameter 8".
    Returns:
      A new DataFrame holding only those eight columns, in that order. It is
      a copy, so modifying it does not touch the input. No synthetic features
      are added.
    Raises:
      KeyError: If any of the eight parameter columns is missing.
    """
    feature_names = [
        "Parameter 1",
        "Parameter 2",
        "Parameter 3",
        "Parameter 4",
        "Parameter 5",
        "Parameter 6",
        "Parameter 7",
        "Parameter 8",
    ]
    return zerlite_13X_error[feature_names].copy()
def preprocess_targets(zerlite_13X_error):
    """Prepare the target (label) column from the zerlite_13X_error data.

    Args:
      zerlite_13X_error: A pandas DataFrame containing an "Objective 1" column.
    Returns:
      A new single-column DataFrame holding "Objective 1", indexed like the
      input.
    Raises:
      KeyError: If the "Objective 1" column is missing.
    """
    output_targets = pd.DataFrame()
    # Create the output targets
    output_targets["Objective 1"] = zerlite_13X_error["Objective 1"]
    # Preview of the first rows, printed on every call.
    print(output_targets.head())
    return output_targets
# Split the shuffled rows by position: 14000 for training (about 70% of the
# 20154 rows), the next 3000 for validation, and the last 3154 for testing.

# Training set: the first 14000 rows (positions 0..13999).
training_examples = preprocess_features(zerlite_13X_error.head(14000))
print('-- Training Examples Describe --')
print(training_examples.describe())
training_targets = preprocess_targets(zerlite_13X_error.head(14000))
print('-- Training Targets Describe --')
print(training_targets.describe())

# Validation set: the next 3000 rows, positions 14000..16999. The slice must
# start at 14000 (right after the training rows) and stop before 17000, which
# is the first test row when the file has 20154 rows.
validation_examples = preprocess_features(zerlite_13X_error.iloc[14000:17000])
print('-- Validation Examples Describe --')
print(validation_examples.describe())
validation_targets = preprocess_targets(zerlite_13X_error.iloc[14000:17000])
print('-- Validation Targets Describe --')
print(validation_targets.describe())

# Test set: the last 3154 rows.
test_examples = preprocess_features(zerlite_13X_error.tail(3154))
print('-- Test Examples Describe --')
print(test_examples.describe())
test_targets = preprocess_targets(zerlite_13X_error.tail(3154))
print('-- Test Targets Describe --')
print(test_targets.describe())
# As we are now working with multiple features, modularize the code for configuring columns into a
# separate function
def construct_feature_columns(input_features):
    """Construct the TensorFlow feature columns.

    Args:
      input_features: The names of the numerical input features to use. Any
        iterable of names works; iterating a pandas DataFrame yields its
        column names, so a DataFrame may be passed directly.
    Returns:
      A set of numeric feature columns, one per name.
    """
    return {tf.feature_column.numeric_column(my_feature)
            for my_feature in input_features}
# Train and evaluate the model
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build the input pipeline that feeds (features, labels) batches to an estimator.

    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data
      num_epochs: Number of epochs for which data should be repeated. None = Repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """
    # Convert pandas data into a dict of np arrays
    features = {key: np.array(value) for key, value in dict(features).items()}
    # Construct a dataset, and configure batching/repeating
    ds = Dataset.from_tensor_slices((features, targets))  # Warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)
    # Shuffle the data, if specified. This runs after batch(), so whole batches
    # are shuffled, using a buffer of 10000 elements.
    if shuffle:
        ds = ds.shuffle(10000)
    # Return the next batch of data. The compat.v1 helper replaces the
    # deprecated Dataset.make_one_shot_iterator() method.
    features, labels = tf.compat.v1.data.make_one_shot_iterator(ds).get_next()
    return features, labels
# Now we will go creating a train model using neural network
def train_nn_regression_model(learning_rate, steps, batch_size, hidden_units,
                              training_examples, training_targets,
                              validation_examples, validation_targets):
    """Trains a neural network regression model of multiple features.

    In addition to training, this function also prints training progress
    information, as well as a plot of the training and validation loss over time.

    Args:
      learning_rate: A 'float', the learning rate
      steps: A non-zero 'int', the total number of training steps. A training step
        consists of a forward and backward pass using a single batch.
      batch_size: A non-zero 'int', the batch size.
      hidden_units: A 'list' of int values, specifying the number of neurons in each layer
      training_examples: A 'DataFrame' containing one or more columns from
        'zerlite_13X_error' to use as input features for training
      training_targets: A 'DataFrame' containing exactly one column from
        'zerlite_13X_error' to use as target for training
      validation_examples: A 'DataFrame' containing one or more columns from
        'zerlite_13X_error' to use as input features for validation
      validation_targets: A 'DataFrame' containing exactly one column from
        'zerlite_13X_error' to use as target for validation
    Returns:
      A 'DNNRegressor' object trained on the training data.
    """
    periods = 10
    # Integer division keeps the per-period step count an int on Python 3 as well.
    steps_per_period = steps // periods

    # Create a DNNRegressor object; gradients are clipped to norm 5.0.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    dnn_regressor = tf.estimator.DNNRegressor(
        feature_columns=construct_feature_columns(training_examples),
        hidden_units=hidden_units,
        optimizer=my_optimizer,
    )

    # Create input functions. The two prediction variants make a single,
    # unshuffled pass so predictions line up with the target rows.
    training_input_fn = lambda: my_input_fn(training_examples,
                                            training_targets["Objective 1"],
                                            batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(training_examples,
                                                    training_targets["Objective 1"],
                                                    num_epochs=1,
                                                    shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                                      validation_targets["Objective 1"],
                                                      num_epochs=1,
                                                      shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically assess loss metrics
    print("Training Models ............")
    print("RMSE (on training data): ")
    training_rmse = []
    validation_rmse = []
    for period in range(0, periods):  # Python shows error occurring here
        # Train the model, starting from the prior state
        dnn_regressor.train(
            input_fn=training_input_fn,
            steps=steps_per_period)
        # Take a break and compute predictions
        training_predictions = dnn_regressor.predict(input_fn=predict_training_input_fn)
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])
        validation_predictions = dnn_regressor.predict(input_fn=predict_validation_input_fn)
        validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
        # Compute training and validation loss
        training_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(training_predictions, training_targets))
        validation_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(validation_predictions, validation_targets))
        # Occasionally print the current loss
        print(" period %02d: %02f" % (period, training_root_mean_squared_error))
        # Add the loss metrics from this period to our list
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished")

    # Output a graph of loss metrics over periods
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error v.s. Periods")
    plt.tight_layout()
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()

    print("Final RMSE (on training data): %0.2f" % training_root_mean_squared_error)
    print("Final RMSE (on validation data): %0.2f" % validation_root_mean_squared_error)
    return dnn_regressor
# Train NN model
# Calls the training routine with the training and validation DataFrames; the
# hyperparameters are passed by keyword: learning rate 0.1, 5000 steps in total,
# batch size 10, and two hidden layers of 10 and 2 neurons.
dnn_regressor = train_nn_regression_model(
learning_rate=0.1,
steps=5000,
batch_size=10,
hidden_units=[10, 2],
training_examples=training_examples,
training_targets=training_targets,
validation_examples=validation_examples,
validation_targets=validation_targets) # Python shows error here
I'm new to Python and TensorFlow and I'm trying to build a simple working example with fake data in TensorFlow. My goal is to use the DNNRegressor estimator to predict a real value from a multidimensional input. This is the code I wrote:
import pandas as pd
import tensorflow as tf
import numpy as np
# Amount of train samples
m_train = 1000
# Amount of test samples
m_test = 100
# Dimensions for each sample
n = 10
# Wrap a dataset in a zero-argument callable, the form passed as input_fn below.
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
# Create random samples with numpy: each tuple is (inputs of shape (m, n), targets of shape (m, 1))
train_data = (np.random.sample((m_train,n)), np.random.sample((m_train,1)))
test_data = (np.random.sample((m_test,n)), np.random.sample((m_test,1)))
# Create two datasets, one for training and the other for testing
# NOTE(review): the datasets are built from plain (inputs, targets) tuples, so the
# features element is a single tensor, not a dict keyed by feature name.
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)
# One real-valued feature column of dimension n, declared with an empty name.
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=n)]
model = tf.estimator.DNNRegressor(hidden_units=[20, 20], feature_columns=feature_columns)
# Train the model
model.train(input_fn=from_dataset(train_dataset), steps=1000)
# Evaluate the unseen samples
eval_result = model.evaluate(input_fn=from_dataset(test_dataset))
And this is the error I get:
$ python fake.py
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmp1j5irF
Traceback (most recent call last):
File "fake.py", line 28, in <module>
model.train(input_fn=from_dataset(train_dataset), steps=1000)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 314, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 743, in _train_model
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 725, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/canned/dnn.py", line 448, in _model_fn
config=config)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/canned/dnn.py", line 153, in _dnn_model_fn
'Given type: {}'.format(type(features)))
ValueError: features should be a dictionary of `Tensor`s. Given type: <class 'tensorflow.python.framework.ops.Tensor'>
I suppose I have to use a dictionary of Tensors, but I'm just beginning in Python and I don't know how to do it.
You need to return the result of get_next(), rather than a lambda function that returns it. Check out https://github.com/tensorflow/tensorflow/blob/r1.8/tensorflow/examples/get_started/regression/dnn_regression.py
I have a simple sklearn class I would like to use as part of an sklearn pipeline. This class just takes a pandas dataframe X_DF and a categorical column name, and calls pd.get_dummies to return the dataframe with the column turned into a matrix of dummy variables...
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
class dummy_var_encoder(TransformerMixin, BaseEstimator):
'''Convert selected categorical column to (set of) dummy variables

The column is chosen with the ``column_to_dummy`` constructor argument and
expanded with pd.get_dummies in transform().
'''
def __init__(self, column_to_dummy='default_col_name'):
# NOTE(review): the value is stored as `column`, not under the parameter name
# `column_to_dummy` -- confirm this works with how BaseEstimator reads
# constructor parameters back from the instance.
self.column = column_to_dummy
# Debug output of the configured column name (Python 2 print statement).
print self.column
def fit(self, X_DF, y=None):
# Nothing is learned from the data; the transformer itself is returned.
return self
def transform(self, X_DF):
''' Update X_DF to have set of dummy-variables instead of orig column'''
# convert self-attribute to local var for ease of stepping through function
column = self.column
# build the dummy columns, each named with the column name as prefix
dummy_matrix = pd.get_dummies(X_DF[column], prefix=column)
# NOTE(review): the original categorical column is concatenated in front of the
# dummies here, so it is kept in the result rather than dropped.
new_DF = pd.concat([X_DF[column], dummy_matrix], axis=1)
return new_DF
Now, using this transformer on its own to fit/transform, I get output as expected. For some toy data as below:
from sklearn import datasets
# Toy data: the iris measurements as a DataFrame, the class labels as a Series named 'y'.
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='y')
# Derive two arbitrary categorical features by cutting sepal length and
# sepal width into three bins each.
size_labels = ['small', 'medium', 'large']
X['category_1'] = pd.cut(X['sepal length (cm)'], bins=3, labels=size_labels)
X['category_2'] = pd.cut(X['sepal width (cm)'], bins=3, labels=size_labels)
...my dummy encoder produces the correct output:
# Use the encoder on its own: configure it for 'category_1', fit it, and look at
# rows 15 through 20 of the transformed frame.
encoder = dummy_var_encoder(column_to_dummy = 'category_1')
encoder.fit(X)
encoder.transform(X).iloc[15:21,:]
category_1
category_1 category_1_small category_1_medium category_1_large
15 medium 0 1 0
16 small 1 0 0
17 small 1 0 0
18 medium 0 1 0
19 small 1 0 0
20 small 1 0 0
However, when I call the same transformer from an sklearn pipeline as defined below:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
# Define Pipeline: the dummy encoder (default arguments) followed by the classifier.
clf = LogisticRegression(penalty='l1')
pipeline_steps = [
    ('dummy_vars', dummy_var_encoder()),
    ('clf', clf),
]
pipeline = Pipeline(pipeline_steps)
# Hyperparameters to try for the dummy encoder and the classifier.
# 4 candidates: dummying category_1 vs category_2, combined with an l1 vs l2
# penalty in the logistic regression.
param_grid = dict(
    dummy_vars__column_to_dummy=['category_1', 'category_2'],
    clf__penalty=['l1', 'l2'],
)
# Define full model search process
cv_model_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=KFold(),
    refit=True,
    verbose=3,
)
All's well until I fit the pipeline, at which point I get an error from the dummy encoder:
# Run the grid search on the toy data; this is the call that raises the error below.
cv_model_search.fit(X,y=y)
In [101]: cv_model_search.fit(X,y=y) Fitting 3 folds for each of 4
candidates, totalling 12 fits
None None None None
[CV] dummy_vars__column_to_dummy=category_1, clf__penalty=l1 .........
Traceback (most recent call last):
File "", line 1, in
cv_model_search.fit(X,y=y)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/model_selection/_search.py",
line 638, in fit
cv.split(X, y, groups)))
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 779, in call
while self.dispatch_one_batch(iterator):
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 625, in dispatch_one_batch
self._dispatch(tasks)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py",
line 111, in apply_async
result = ImmediateResult(func)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py",
line 332, in init
self.results = batch()
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 131, in call
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/model_selection/_validation.py",
line 437, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/pipeline.py",
line 257, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/pipeline.py",
line 222, in _fit
**fit_params_steps[name])
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/memory.py",
line 362, in call
return self.func(*args, **kwargs)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/pipeline.py",
line 589, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/base.py",
line 521, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "", line 21, in transform
dummy_matrix = pd.get_dummies(X_DF[column], prefix=column)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/pandas/core/frame.py",
line 1964, in getitem
return self._getitem_column(key)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/pandas/core/frame.py",
line 1971, in _getitem_column
return self._get_item_cache(key)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/pandas/core/generic.py",
line 1645, in _get_item_cache
values = self._data.get(item)
File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/pandas/core/internals.py",
line 3599, in get
raise ValueError("cannot label index with a null key")
ValueError: cannot label index with a null key
The trace is telling you exactly what went wrong. Learning to diagnose the trace is really quite invaluable especially when you are inheriting from libraries you might not have a complete understanding of.
Now, I have done a fair bit of inheriting in sklearn myself and I can tell you without a doubt GridSearchCV is going to give you some trouble if the type of data input into your fit or fit_transform methods are not NumPy arrays. As Vivek mentioned in his comment the X getting passed to your fit method is no longer a DataFrame. But let's take a look at the trace first.
ValueError: cannot label index with a null key
While Vivek is correct about the NumPy array, you have another problem here. The actual error you get is that the value of column in your transform method is None. If you were to look at your encoder object above, you would see that the __repr__ method outputs the following:
dummy_var_encoder(column_to_dummy=None)
When using Pipeline, this param is what gets initialized and passed along to GridSearchCV. This is a behavior that can be seen throughout cross validation and search methods as well, and having attributes with different names from the input parameter causes issues like this. Fixing this will start you down the right path.
Modifying the __init__ method as such will solve this specific issue:
def __init__(self, column='default_col_name'):
    """Store the name of the categorical column to expand into dummy variables.

    The attribute deliberately has the same name as the constructor
    parameter: scikit-learn's get_params()/set_params()/clone() look each
    constructor parameter up as an attribute of that name. With this
    signature the matching grid-search key is 'dummy_vars__column'.
    """
    self.column = column
    # Debug output of the configured column name.
    print(self.column)
However, once you have done this, the issue Vivek mentioned will rear its head and you will have to deal with that. This is something I have run into before, though not with DataFrames specifically. I came up with a solution in "Use sklearn GridSearchCV on custom class whose fit method takes 3 arguments". Basically, I created a wrapper that implements the __getitem__ method in a way that makes the data look and behave so that it passes the validation methods used in GridSearchCV, Pipeline, and other cross-validation methods.
Edit
I made these changes, and it looks like your problem then comes from the validation method check_array. While calling this method with dtype=pd.DataFrame would work, the linear model calls it with dtype=np.float64, which throws an error. To get around this, instead of concatenating the original data with your dummies, you could just return your dummy columns and fit using those. This is something that should be done anyway, since you wouldn't want to include both the dummy columns and the original data in the model you are trying to fit. You may also consider the drop_first option, but I'm getting off subject. So, changing your transform method like so allows the whole process to work as expected.
def transform(self, X_DF):
    '''Return the dummy-variable columns for the configured categorical column.

    Only the dummy columns are returned; the original categorical column is
    not included, so no label-valued column reaches the downstream estimator.
    '''
    # convert self-attribute to local var for ease of stepping through function
    column = self.column
    # build one indicator column per category, named "<column>_<category>"
    dummy_matrix = pd.get_dummies(X_DF[column], prefix=column)
    return dummy_matrix
Been plagued by this bug for a while now and could use some hive-mind help to (hopefully) catch something I'm missing.
tl;dr version
Pandas is raising a TypeError when I pass scikit-learn GridSearchCV ndarrays of floats to train on.
full version
I'm using an sklearn GridSearchCV object to fit a 1D list of float target variables scaled between 0.0 and 1.0 to a 2D list of float feature variables (input variables) also scaled between 0 and 1. Both lists have the same number of samples.
The abbreviated code where I pass my training data to GridSearchCV.fit() is as follows:
# the training data are attributes of a model class defined elsewhere
model.X_scaled # ndarray of feature data of shape (N_samples, N_features)
model.Y_scaled # ndarray target data of length N_samples
# setup the GridSearchCV instance
# NOTE(review): the fold count below is taken from self.Y, while fit() further down
# receives model.Y_scaled -- confirm both have the same number of samples.
search = grid_search.GridSearchCV(estimator = svr,
param_grid = params, # C and epsilon parameters are set elsewhere in a dict
n_jobs = self.parallel_processes,
scoring = self.scoring_metric,
cv = np.shape(self.Y)[0], # sets the fold size for cross-val. cv = # of samples is essentially LOO CV.
verbose = 0)
# print out some info to get a better idea of the training data
print "model.X_scaled"
print type(model.X_scaled[0][0]) # this is getting the type of one of the elements of X_scaled
print np.shape(model.X_scaled)
print model.X_scaled.tolist()
print "model.Y_scaled"
print type(model.Y_scaled) # this is getting the type of the entire array structure
print np.shape(model.Y_scaled)
print model.Y_scaled
# fit GridSearchCV on the training data
search.fit(model.X_scaled, model.Y_scaled)
When this code is run, I get the resulting output:
model.X_scaled
<type 'numpy.float64'>
(81, 16)
model.Y_scaled
<type 'numpy.ndarray'>
(81,)
Traceback (most recent call last):
File "SurrogateModel.py", line 416, in <module>
self_test()
File "SurrogateModel.py", line 409, in self_test
model.train()
File "SurrogateModel.py", line 350, in train
search.fit(model.X_scaled, model.Y_scaled)
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/grid_search.py", line 804, in fit
return self._fit(X, y, ParameterGrid(self.param_grid))
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit
for parameters in parameter_iterable
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
while self.dispatch_one_batch(iterator):
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
self._dispatch(tasks)
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch
job = ImmediateComputeBatch(batch)
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__
self.results = batch()
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/svm/base.py", line 193, in fit
fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/sklearn/svm/base.py", line 251, in _dense_fit
max_iter=self.max_iter, random_seed=random_seed)
File "sklearn/svm/libsvm.pyx", line 59, in sklearn.svm.libsvm.fit (sklearn/svm/libsvm.c:1576)
File "/home/jack/Software/Python/anaconda2/envs/xs1opt/lib/python2.7/site-packages/pandas/core/series.py", line 78, in wrapper
"{0}".format(str(converter)))
TypeError: cannot convert the series to <type 'float'>
The stack trace is a bit confusing to me. sklearn makes it all the way to the "fit" call to libsvm, but then a Pandas package raises a TypeError because it can't convert a series object to a float? I looked in the pandas series.py module and found additional context for where the TypeError is being raised:
# in pandas/core/series.py
def _coerce_method(converter):
""" install the scalar coercion methods """
def wrapper(self):
# A one-element Series is converted through its single value ...
if len(self) == 1:
return converter(self.iloc[0])
# ... any other length is rejected with a TypeError naming the converter.
raise TypeError("cannot convert the series to "
"{0}".format(str(converter)))
return wrapper
I don't understand how this Pandas function is being called when neither of the data structures I'm passing to scikit-learn are pandas objects. In the code I've provided, they're both ndarrays, but I've tried passing them as plain lists only to get the same TypeError. Pandas is used earlier in the code to read data from a csv into a DataFrame, but the data is converted to ndarrays before being put into X_scaled and Y_scaled.
The annoying thing is that this very nearly exact same code runs perfectly fine in the script which preceded this one. The version of the code in which I have this problem is basically refactored from the script, but this functionality, training a gridsearch object on the training data, remains mostly unchanged.
Any suggestions on what might be happening here are greatly appreciated. Thank you!
I am new to Python and I am trying to develop a program with Gradient Boosting Regressor.
I have two big sets of data, one training set and one test set, which have exactly the same columns. My goal is to predict the SeriousDlqin2yrs column of the test set with the information of the training set.
This is the program I wrote :
import numpy as np
import csv as csv
import pandas as pd
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.utils import shuffle
# Load data
csv_file_object = csv.reader(open('cs-training-cleandata2NOLOG.csv', 'rb')) #Load in the training csv file
header = csv_file_object.next() #Skip the first line as it is a header
train_data=[] #Create a variable called 'train_data'
for row in csv_file_object: #Step through each row in the csv file
train_data.append(row[1:]) #add each row, minus its first column, to train_data
# NOTE(review): csv.reader yields every field as a string, so this array holds
# strings unless a numeric dtype is requested.
train_data = np.array(train_data) #Then convert from a list to an array
test_file_object = csv.reader(open('cs-test-cleandata2NOLOG.csv', 'rb')) #Load in the test csv file
header = test_file_object.next() #Skip the first line as it is a header
test_data=[] #Create a variable called 'test_data'
ids = [] #first column of every test row, written out as "Id" at the end
for row in test_file_object: #Step through each row in the csv file
ids.append(row[0])
test_data.append(row[1:]) #add each row, minus its first column, to test_data
test_data = np.array(test_data) #Then convert from a list to an array
test_data = np.delete(test_data,[0],1) #remove SeriousDlqin2yrs (column 0 of the array)
print 'Training '
# Fit regression model
clf = GradientBoostingRegressor(n_estimators=1000, min_samples_split=100, learning_rate=0.01)
# Column 0 of train_data is the target; the remaining columns are the features.
clf = clf.fit(train_data[0::,1::],train_data[0::,0])
print 'Predicting'
output=clf.predict(test_data)
# Write one (Id, Probability) row per test sample.
open_file_object = csv.writer(open("GradientBoostedRegression1.1.csv", "wb"))
open_file_object.writerow(["Id","Probability"])
open_file_object.writerows(zip(ids, output))
But when I run the program, python give me this answer :
Traceback (most recent call last):
File "C:\Users\Paul HONORE\Dropbox\Research Study\Kaggle\Bank\GradientBoostedRegression1.1.py", line 64, in <module>
clf = clf.fit(train_data[0::,1::],train_data[0::,0])
File "C:\Python27\lib\site-packages\sklearn\ensemble\gradient_boosting.py", line 1126, in fit
return super(GradientBoostingRegressor, self).fit(X, y)
File "C:\Python27\lib\site-packages\sklearn\ensemble\gradient_boosting.py", line 595, in fit
self.init_.fit(X, y)
File "C:\Python27\lib\site-packages\sklearn\ensemble\gradient_boosting.py", line 69, in fit
self.mean = np.mean(y)
File "C:\Python27\lib\site-packages\numpy\core\fromnumeric.py", line 2716, in mean
out=out, keepdims=keepdims)
File "C:\Python27\lib\site-packages\numpy\core\_methods.py", line 62, in _mean
ret = um.add.reduce(arr, axis=axis, dtype=dtype, out=out, keepdims=keepdims)
TypeError: cannot perform reduce with flexible type
I don't know where this comes from; I have read a lot about this question but never found a solution for this particular problem.
Thank you in advance for your help.
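I think the problem will be solved by specifying a dtype in the np.array call. csv.reader yields every field as a string, so without a dtype the array has a string ("flexible") type that np.mean cannot reduce.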
for instance:
# Convert the string fields read from the CSV to 64-bit floats; np.float64 is
# spelled out instead of the legacy 'float_' alias.
train_data = np.array(train_data, dtype=np.float64)