It's my first post on Stack Overflow because I couldn't find any clue to solve the message "'PipelinedRDD' object has no attribute '_jdf'" that appears when I call trainer.fit on my training dataset to create a neural network model under Spark in Python.
Here is my code:
from pyspark import SparkContext
from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
### Import data in Spark ###
RDD_RAWfileWH = sc.textFile("c:/Anaconda2/Cognet/Data_For_Cognet_ready.csv")
header = RDD_RAWfileWH.first()
# Delete header from RAWData
RDD_RAWfile1 = RDD_RAWfileWH.filter(lambda x: x != header)
# Split each line of the RDD
RDD_RAWfile = RDD_RAWfile1.map(lambda line:[float(x) for x in line.split(',')])
FinalData = RDD_RAWfile.map(lambda row: LabeledPoint(row[0], row[1:]))
(trainingData, testData) = FinalData.randomSplit([0.7, 0.3])
# layer sizes: 15 input features, one hidden layer of 2 neurons, 3 output classes
layers = [15, 2, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
# train the model
model = trainer.fit(trainingData)
and here is the traceback:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-28-123dce2b085a> in <module>()
46 trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128,seed=1234)
47 # train the model
---> 48 model = trainer.fit(trainingData)
49 # compute accuracy on the test set
50 # result = model.transform(test)
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\pipeline.pyc in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\wrapper.pyc in _fit(self, dataset)
131
132 def _fit(self, dataset):
--> 133 java_model = self._fit_java(dataset)
134 return self._create_model(java_model)
135
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\wrapper.pyc in _fit_java(self, dataset)
128 """
129 self._transfer_params_to_java()
--> 130 return self._java_obj.fit(dataset._jdf)
131
132 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
I'm not an expert on Spark, so if anyone knows what this _jdf attribute is and how to solve this issue, it would be very helpful to me.
Thanks a lot
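For anyone hitting the same wall: _jdf is the Java-side DataFrame handle that pyspark.ml wrappers pass to the JVM, so the error means fit received an RDD where a DataFrame was required. A minimal sketch of the conversion, assuming Spark 1.6 and the FinalData RDD built above (it replaces the randomSplit and fit lines):

sqlContext = SQLContext(sc)
# pyspark.ml estimators train on DataFrames; toDF() turns the LabeledPoint RDD
# into a DataFrame with the "label" and "features" columns the classifier expects
(trainingData, testData) = FinalData.toDF().randomSplit([0.7, 0.3])
model = trainer.fit(trainingData)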
Related
I am trying to use a stacking classifier with three base learners (random forest, gradient boosting, and SVM) and a logistic regression meta-learner, but I keep getting an error.
Here is my code:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
rf = RandomForestClassifier(n_estimators=100, random_state=100)
gb = GradientBoostingClassifier(n_estimators=100, random_state=100)
svm = make_pipeline(StandardScaler(), SVC(random_state=100))
estimators = [('RF', rf),
              ('GB', gb),
              ('SVM', svm)]
Model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression)
Model.fit(X_train,y_train).score(X_val,y_val)
Here is the error:
TypeError Traceback (most recent call last)
<ipython-input-47-40186fb4189e> in <module>
----> 1 Model.fit(X_train,y_train).score(X_val,y_val)
~\anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py in fit(self, X, y, sample_weight)
437 self._le = LabelEncoder().fit(y)
438 self.classes_ = self._le.classes_
--> 439 return super().fit(X, self._le.transform(y), sample_weight)
440
441 @if_delegate_has_method(delegate='final_estimator_')
~\anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py in fit(self, X, y, sample_weight)
138 # 'drop' string.
139 names, all_estimators = self._validate_estimators()
--> 140 self._validate_final_estimator()
141
142 stack_method = [self.stack_method] * len(all_estimators)
~\anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py in _validate_final_estimator(self)
406
407 def _validate_final_estimator(self):
--> 408 self._clone_final_estimator(default=LogisticRegression())
409 if not is_classifier(self.final_estimator_):
410 raise ValueError(
~\anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py in _clone_final_estimator(self, default)
55 def _clone_final_estimator(self, default):
56 if self.final_estimator is not None:
--> 57 self.final_estimator_ = clone(self.final_estimator)
58 else:
59 self.final_estimator_ = clone(default)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
62 if isinstance(estimator, type):
63 raise TypeError("Cannot clone object. " +
--> 64 "You should provide an instance of " +
65 "scikit-learn estimator instead of a class.")
66 else:
TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.
I am applying this to the Titanic dataset to use the power of all the algorithms at my disposal.
I have never used stacking classification or regression before, so this is my first time.
Thanks and regards
As @amiola pointed out, you're missing the parentheses after LogisticRegression, which are what create a new instance of that class:
Model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression)
should be
Model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
^^
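The underlying reason is that scikit-learn clones the final estimator during fit, and clone requires an estimator instance rather than the class object. Passing an instance also lets you configure the meta-learner; for example (the max_iter value is only an illustration, not something from the question):

Model = StackingClassifier(estimators=estimators,
                           final_estimator=LogisticRegression(max_iter=1000))
Model.fit(X_train, y_train).score(X_val, y_val)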
This is not a new question; I found references to it (first and second), but none of their solutions work for me.
I'm a newbie to PyTorch, and I'm facing AttributeError: 'Field' object has no attribute 'vocab' while creating batches of text data in PyTorch using torchtext.
Following the book Deep Learning with PyTorch, I wrote the same example as explained there.
Here's the snippet:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
TEXT = data.Field(lower=True, batch_first=True, fix_length=20)
LABEL = data.Field(sequential=False)
train, test = datasets.IMDB.splits(TEXT, LABEL)
print("train.fields:", train.fields)
print()
print(vars(train[0])) # prints the object
TEXT.build_vocab(train, vectors=GloVe(name="6B", dim=300),
                 max_size=10000, min_freq=10)
# VOCABULARY
# print(TEXT.vocab.freqs) # freq
# print(TEXT.vocab.vectors) # vectors
# print(TEXT.vocab.stoi) # Index
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=128, device=-1, shuffle=True, repeat=False)  # -1 for cpu, None for gpu
# Not working (FROM BOOK)
# batch = next(iter(train_iter))
# print(batch.text)
# print()
# print(batch.label)
# This is also not working (from the second solution)
for i in train_iter:
    print(i.text)
    print(i.label)
Here's the stacktrace:
AttributeError Traceback (most recent call last)
<ipython-input-33-433ec3a2ca3c> in <module>()
7
8
----> 9 for i in train_iter:
10 print (i.text)
11 print (i.label)
/anaconda3/lib/python3.6/site-packages/torchtext/data/iterator.py in __iter__(self)
155 else:
156 minibatch.sort(key=self.sort_key, reverse=True)
--> 157 yield Batch(minibatch, self.dataset, self.device)
158 if not self.repeat:
159 return
/anaconda3/lib/python3.6/site-packages/torchtext/data/batch.py in __init__(self, data, dataset, device)
32 if field is not None:
33 batch = [getattr(x, name) for x in data]
---> 34 setattr(self, name, field.process(batch, device=device))
35
36 #classmethod
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in process(self, batch, device)
199 """
200 padded = self.pad(batch)
--> 201 tensor = self.numericalize(padded, device=device)
202 return tensor
203
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in numericalize(self, arr, device)
300 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
301 else:
--> 302 arr = [self.vocab.stoi[x] for x in arr]
303
304 if self.postprocessing is not None:
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in <listcomp>(.0)
300 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
301 else:
--> 302 arr = [self.vocab.stoi[x] for x in arr]
303
304 if self.postprocessing is not None:
AttributeError: 'Field' object has no attribute 'vocab'
If not using BucketIterator, what else can I use to get a similar output?
You haven't built the vocab for the LABEL field.
After TEXT.build_vocab(train, ...), run LABEL.build_vocab(train), and the rest will run.
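In the snippet above, that means one extra line right after the TEXT vocab is built:

TEXT.build_vocab(train, vectors=GloVe(name="6B", dim=300),
                 max_size=10000, min_freq=10)
LABEL.build_vocab(train)  # builds LABEL.vocab, which numericalize needs when batching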
I am using Keras with the TensorFlow backend (on Python 2.7), and I built two dense layers on top of a pretrained VGG16 model with four classes. I get pretty decent results on my validation set. Now I want to use LIME to interpret my results.
I imported the lime package and transformed one of my images following the LIME GitHub repo: https://github.com/marcotcr/lime/blob/master/doc/notebooks/Tutorial%20-%20Image%20Classification%20Keras.ipynb. My path_list contains one photo.
import numpy as np
from keras.preprocessing import image  # assumed import; load_img/img_to_array come from here in the tutorial
import lime
from lime import lime_image

def transform_img_fn(path_list):
    out = []
    for img_path in path_list:
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img) / 255
        x = np.expand_dims(x, axis=0)
        out.append(x)
    return np.vstack(out)
check_image = transform_img_fn(path_list)
Then
check_image[0].shape
OUTPUT: (3, 224, 224)
predictions[0]
OUTPUT: array([9.67346e-01, 3.00240e-03, 2.96037e-02, 4.79915e-05], dtype=float32)
explainer = lime_image.LimeImageExplainer()
explanation = explainer.explain_instance(check_image[0], model_top.predict, hide_color=0, num_samples=100)
I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-115-e286e97e849d> in <module>()
----> 1 explanation = explainer.explain_instance(check_image[0], model_top.predict, hide_color=0, num_samples=100)
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/site-packages/lime/lime_image.pyc in explain_instance(self, image, classifier_fn, labels, hide_color, top_labels, num_features, num_samples, batch_size, segmentation_fn, distance_metric, model_regressor, random_seed)
165 segmentation_fn = SegmentationAlgorithm('quickshift', kernel_size=4,
166 max_dist=200, ratio=0.2,
--> 167 random_seed=random_seed)
168 try:
169 segments = segmentation_fn(image)
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/site-packages/lime/wrappers/scikit_image.pyc in __init__(self, algo_type, **target_params)
103 if (self.algo_type == 'quickshift'):
104 BaseWrapper.__init__(self, quickshift, **target_params)
--> 105 kwargs = self.filter_params(quickshift)
106 self.set_params(**kwargs)
107 elif (self.algo_type == 'felzenszwalb'):
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/site-packages/lime/wrappers/scikit_image.pyc in filter_params(self, fn, override)
82 result = {}
83 for name, value in self.target_params.items():
---> 84 if has_arg(fn, name):
85 result.update({name: value})
86 result.update(override)
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/site-packages/lime/utils/generic_utils.pyc in has_arg(fn, arg_name)
19 else:
20 try:
---> 21 arg_spec = inspect.getargspec(fn.__call__)
22 except AttributeError:
23 return False
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/inspect.pyc in getargspec(func)
813 func = func.im_func
814 if not isfunction(func):
--> 815 raise TypeError('{!r} is not a Python function'.format(func))
816 args, varargs, varkw = getargs(func.func_code)
817 return ArgSpec(args, varargs, varkw, func.func_defaults)
TypeError: <method-wrapper '__call__' of builtin_function_or_method object at 0x7fea20ea4e60> is not a Python function
Based on the documentation, "classifier_fn: function that takes a list of images and returns a matrix of prediction probabilities", I passed model_top.predict as this argument. I can get all of my predictions if I call predictions = model_top.predict(validation_data, batch_size=32).
Any help would be appreciated.
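From the traceback, the failure happens before any prediction is made: since no segmentation_fn was supplied, explain_instance builds SegmentationAlgorithm('quickshift', ...), and lime's has_arg helper then calls inspect.getargspec on scikit-image's compiled quickshift (a builtin_function_or_method, as the error says), which Python 2.7's inspect cannot handle. One possible workaround, sketched here but not verified on this setup, is to pass a plain Python segmentation function so that wrapper is never built; the parameters below just mirror the defaults visible in the traceback:

from skimage.segmentation import quickshift

def segmentation_fn(img):
    # a plain Python function is introspectable even under Python 2.7
    return quickshift(img, kernel_size=4, max_dist=200, ratio=0.2)

explanation = explainer.explain_instance(check_image[0], model_top.predict,
                                         hide_color=0, num_samples=100,
                                         segmentation_fn=segmentation_fn)

Separately, LIME and scikit-image expect images as (height, width, channels), so the (3, 224, 224) channels-first shape shown above may need a transpose as well.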
data = sqlContext.sql("select a.churn,b.pay_amount,c.all_balance from db_bi.t_cust_churn a left join db_bi.t_cust_pay b on a.cust_id=b.cust_id left join db_bi.t_cust_balance c on a.cust_id=c.cust_id limit 5000").cache()
from pyspark.mllib.regression import LabeledPoint

def labelData(df):
    return df.map(lambda row: LabeledPoint(row[0], row[1:]))

traindata = labelData(data)  # this step works well
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(lrdata)
which raises:
AttributeError Traceback (most recent call last)
<ipython-input-40-b84a106121e6> in <module>()
----> 1 lrModel = lr.fit(lrdata)
/home/hadoop/spark/python/pyspark/ml/pipeline.pyc in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/home/hadoop/spark/python/pyspark/ml/wrapper.pyc in _fit(self, dataset)
131
132 def _fit(self, dataset):
--> 133 java_model = self._fit_java(dataset)
134 return self._create_model(java_model)
135
/home/hadoop/spark/python/pyspark/ml/wrapper.pyc in _fit_java(self, dataset)
128 """
129 self._transfer_params_to_java()
--> 130 return self._java_obj.fit(dataset._jdf)
131
132 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
I guess you are following the tutorial for the latest Spark version (2.0.1), which uses from pyspark.ml.classification import LogisticRegression, whereas you need the one matching your version, e.g. 1.6.2, which uses from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel. Note the different libraries: pyspark.ml works on DataFrames, while pyspark.mllib works on RDDs like the one labelData produces.
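A minimal sketch of the mllib route, assuming the traindata RDD of LabeledPoint built above:

from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# the RDD-based API trains directly on an RDD of LabeledPoint,
# so no DataFrame (and no _jdf) is involved
lrModel = LogisticRegressionWithLBFGS.train(traindata, iterations=10)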
I am trying to fit a sample probabilistic graphical model using sample data.
While fitting the data to the model, I encounter a TypeError. Sample code is given below:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
data = np.random.uniform(low=0, high=2, size=(1000, 4)).astype('float')
data
data = pd.DataFrame(data, columns=['cost', 'quality', 'location', 'no_of_people'])
train = data[:750]
test = data[750:].drop('no_of_people', axis=1)
restaurant_model = BayesianModel(
[('location', 'cost'),
('quality', 'cost'),
('location', 'no_of_people'),
('cost', 'no_of_people')])
restaurant_model.fit(train)
I am encountering the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-173-8e3a85cb8b56> in <module>()
----> 1 restaurant_model.fit(train)
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\models\BayesianModel.py in fit(self, data, estimator_type)
568 estimator = estimator_type(self, data)
569
--> 570 cpds_list = estimator.get_parameters()
571 self.add_cpds(*cpds_list)
572
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\estimators\MLE.py in get_parameters(self)
64 state_counts = state_counts.reindex(sorted(state_counts.index))
65 cpd = TabularCPD(node, self.node_card[node],
---> 66 state_counts.values[:, np.newaxis])
67 cpd.normalize()
68 parameters.append(cpd)
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\factors\CPD.py in __init__(self, variable, variable_card, values, evidence, evidence_card)
137 raise TypeError("Values must be a 2D list/array")
138
--> 139 super(TabularCPD, self).__init__(variables, cardinality, values.flatten('C'))
140
141 def __repr__(self):
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\factors\Factor.py in __init__(self, variables, cardinality, values)
98
99 if values.dtype != int and values.dtype != float:
--> 100 raise TypeError("Values: Expected type int or type float, got ", values.dtype)
101
102 if len(cardinality) != len(variables):
TypeError: ('Values: Expected type int or type float, got ', dtype('int64'))
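The check that fails is visible in the traceback (Factor.py line 99: values.dtype != int and values.dtype != float). A plausible but unverified explanation for why it trips on integer counts: the maximum-likelihood state counts come out as int64, while NumPy 1.x on Windows maps Python's int to int32, so the strict dtype comparison fails even though the values really are integers. A quick check of that assumption:

import numpy as np

print(np.dtype(int))             # int32 on Windows builds of NumPy 1.x, int64 on most Linux builds
print(np.dtype('int64') == int)  # False when the line above shows int32, which is what trips pgmpy's check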