Probabilistic graphical model: error while fitting the model - Python

I am trying to fit a sample probabilistic graphical model using sample data.
While fitting the model to the data, I encounter a TypeError. Sample code is given below:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel

data = np.random.uniform(low=0, high=2, size=(1000, 4)).astype('float')
data = pd.DataFrame(data, columns=['cost', 'quality', 'location', 'no_of_people'])
train = data[:750]
test = data[750:].drop('no_of_people', axis=1)
restaurant_model = BayesianModel([('location', 'cost'),
                                  ('quality', 'cost'),
                                  ('location', 'no_of_people'),
                                  ('cost', 'no_of_people')])
restaurant_model.fit(train)
I am encountering the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-173-8e3a85cb8b56> in <module>()
----> 1 restaurant_model.fit(train)
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\models\BayesianModel.py in fit(self, data, estimator_type)
568 estimator = estimator_type(self, data)
569
--> 570 cpds_list = estimator.get_parameters()
571 self.add_cpds(*cpds_list)
572
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\estimators\MLE.py in get_parameters(self)
64 state_counts = state_counts.reindex(sorted(state_counts.index))
65 cpd = TabularCPD(node, self.node_card[node],
---> 66 state_counts.values[:, np.newaxis])
67 cpd.normalize()
68 parameters.append(cpd)
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\factors\CPD.py in __init__(self, variable, variable_card, values, evidence, evidence_card)
137 raise TypeError("Values must be a 2D list/array")
138
--> 139 super(TabularCPD, self).__init__(variables, cardinality, values.flatten('C'))
140
141 def __repr__(self):
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\factors\Factor.py in __init__(self, variables, cardinality, values)
98
99 if values.dtype != int and values.dtype != float:
--> 100 raise TypeError("Values: Expected type int or type float, got ", values.dtype)
101
102 if len(cardinality) != len(variables):
TypeError: ('Values: Expected type int or type float, got ', dtype('int64'))
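The raise comes from the check at Factor.py line 99, which compares values.dtype against Python's built-in int; on Windows builds of numpy that maps to int32, so the int64 state counts computed by pandas fail the comparison even though they are integers. This check appears to have been relaxed in later pgmpy releases. A minimal sketch that sidesteps the problem under two assumptions: that discrete data was intended (np.random.uniform produces continuous floats, which maximum-likelihood estimation would treat as roughly a thousand distinct states per node) and that a current pgmpy release is installed:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel  # renamed BayesianNetwork in newer pgmpy releases

# Discrete states (0 or 1) instead of continuous uniform draws
data = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 4)),
                    columns=['cost', 'quality', 'location', 'no_of_people'])
train = data[:750]

restaurant_model = BayesianModel([('location', 'cost'),
                                  ('quality', 'cost'),
                                  ('location', 'no_of_people'),
                                  ('cost', 'no_of_people')])
restaurant_model.fit(train)  # MLE now receives integer state counts it accepts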

Related

TextBlob error: too many values to unpack

I am trying to run the following code, but I get an error that there are too many values to unpack.
The code is:
import csv
import json
import pandas as pd
df = pd.read_csv("job/my_data_frame_test.csv", encoding="utf-8")
df.info()
print(df)
  TEXT text recommended
0  ABC              yes
1  DEF               no
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
cl = NaiveBayesClassifier(df)
After running this code, I get the following error (in full):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-3d683b8c482a> in <module>
----> 1 cl = NaiveBayesClassifier(df)
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in __init__(self, train_set, feature_extractor, format, **kwargs)
203 def __init__(self, train_set,
204 feature_extractor=basic_extractor, format=None, **kwargs):
--> 205 super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs)
206 self.train_features = [(self.extract_features(d), c) for d, c in self.train_set]
207
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in __init__(self, train_set, feature_extractor, format, **kwargs)
137 else: # train_set is a list of tuples
138 self.train_set = train_set
--> 139 self._word_set = _get_words_from_dataset(self.train_set) # Keep a hidden set of unique words.
140 self.train_features = None
141
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in _get_words_from_dataset(dataset)
61 return words
62 all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
---> 63 return set(all_words)
64
65 def _get_document_tokens(document):
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in <genexpr>(.0)
60 else:
61 return words
---> 62 all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
63 return set(all_words)
64
ValueError: too many values to unpack (expected 2)
NaiveBayesClassifier() expects a list of tuples of the form (text, label):
train = list(zip(df['TEXT'], df['text recommended']))
# [('ABC', 'yes'), ('DEF', 'no')]
cl = NaiveBayesClassifier(train)
# <NaiveBayesClassifier trained on 2 instances>
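Alternatively, textblob can read a training set straight from a file handle via its format argument; a minimal sketch, assuming the CSV is reduced to headerless text,label rows:
from textblob.classifiers import NaiveBayesClassifier

# Assumes a two-column, headerless CSV where each row is: text,label
with open('job/my_data_frame_test.csv', 'r', encoding='utf-8') as fp:
    cl = NaiveBayesClassifier(fp, format='csv')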

sklearn pipeline transform ValueError: expected columns do not match trained columns

Can you please help me with the following function, where I get the error ValueError: Column ordering must be equal for fit and for transform when using the remainder keyword?
(The function is called on a pickled sklearn pipeline that I saved in GCP Storage.)
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-192-c6a8bc0ab221> in <module>
----> 1 safety_project_lite(request)
<ipython-input-190-24c565131f14> in safety_project_lite(request)
31
32 df_resp = pd.DataFrame(data=request_data)
---> 33 response = loaded_model.predict(df_resp)
34
35 output = {"Safety Rating": response[0]}
~/.local/lib/python3.5/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~/.local/lib/python3.5/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
417 Xt = X
418 for _, name, transform in self._iter(with_final=False):
--> 419 Xt = transform.transform(Xt)
420 return self.steps[-1][-1].predict(Xt, **predict_params)
421
~/.local/lib/python3.5/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
581 if (n_cols_transform >= n_cols_fit and
582 any(X.columns[:n_cols_fit] != self._df_columns)):
--> 583 raise ValueError('Column ordering must be equal for fit '
584 'and for transform when using the '
585 'remainder keyword')
ValueError: Column ordering must be equal for fit and for transform when using the remainder keyword
Code:
def safety_project_lite_beta(request):
    client = storage.Client(request.GCP_Project)
    bucket = client.get_bucket(request.GCP_Bucket)
    blob = bucket.blob(request.GCP_Path)
    model_file = BytesIO()
    blob.download_to_file(model_file)
    loaded_model = pickle.loads(model_file.getvalue())
    request_data = {'A': [request.A],
                    'B': [request.B],
                    'C': [request.C],
                    'D': [request.D],
                    'E': [request.E],
                    'F': [request.F]}
    df_resp = pd.DataFrame(data=request_data)
    response = loaded_model.predict(df_resp)
    output = {"Rating": response[0]}
    return output
The model can only make predictions on data with the same structure as the data it was trained on.
To ensure df_resp has the same columns, in the same order, as X_train, pass the training columns along when building the dataframe:
df_resp = pd.DataFrame(request_data, columns=X_train.columns)
If that variable is for some reason not available at serving time, you can pickle its column list (X_train.columns) and load it later:
loaded_cols = pickle.loads([...])
df_resp = pd.DataFrame(data=request_data, columns=loaded_cols)
This also makes the workflow more flexible, for example if you later add columns.
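A minimal sketch of that pickling approach; the file name model_columns.pkl is illustrative, and X_train / request_data are assumed from the training and serving code above:
import pickle
import pandas as pd

# At training time: persist the training column order next to the model
with open('model_columns.pkl', 'wb') as f:  # hypothetical file name
    pickle.dump(list(X_train.columns), f)

# At serving time: rebuild the request frame in the exact trained order
with open('model_columns.pkl', 'rb') as f:
    loaded_cols = pickle.load(f)
df_resp = pd.DataFrame(data=request_data, columns=loaded_cols)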

torchtext Field with values already converted to ids raises "an integer is required"

I followed this tutorial
http://www.programmersought.com/article/2609385756/
to create a TabularDataset with data that is already tokenized and converted to ids. I do not want to use a vocab or build one, because the data is numerical, so I defined my field as:
myField = Field(tokenize=x_tokenize, use_vocab=False, sequential=True)
train, val, test = data.TabularDataset.splits(path='./', train=train_path, validation=valid_path, test=test_path, format='csv', fields=data_fields, skip_header=True)
train output:
print(vars(train[0])['src'])
# output: [101, 3177, 3702, 11293, 1116, 102]
and I used a BucketIterator:
train_iter = BucketIterator(train,
                            batch_size=BATCH_SIZE,
                            device=DEVICE,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            train=True,
                            batch_size_fn=batch_size_fn,
                            repeat=False)
When I run this code:
batch = next(iter(train_iter))
I get:
TypeError: an integer is required (got type list)
TypeError Traceback (most recent call last)
<ipython-input-...> in <module>()
----> 1 batch = next(iter(train_iter))
3 frames
/usr/local/lib/python3.6/dist-packages/torchtext/data/iterator.py in __iter__(self)
155 else:
156 minibatch.sort(key=self.sort_key, reverse=True)
--> 157 yield Batch(minibatch, self.dataset, self.device)
158 if not self.repeat:
159 return
/usr/local/lib/python3.6/dist-packages/torchtext/data/batch.py in __init__(self, data, dataset, device)
32 if field is not None:
33 batch = [getattr(x, name) for x in data]
---> 34 setattr(self, name, field.process(batch, device=device))
35
36 @classmethod
/usr/local/lib/python3.6/dist-packages/torchtext/data/field.py in process(self, batch, device)
199 """
200 padded = self.pad(batch)
--> 201 tensor = self.numericalize(padded, device=device)
202 return tensor
203
/usr/local/lib/python3.6/dist-packages/torchtext/data/field.py in numericalize(self, arr, device)
321 arr = self.postprocessing(arr, None)
322
--> 323 var = torch.tensor(arr, dtype=self.dtype, device=device)
324
325 if self.sequential and not self.batch_first:
TypeError: an integer is required (got type list)
You have to provide the pad_token while declaring the Field.
Change this
myField = Field(tokenize= x_tokenize, use_vocab=False, sequential=True)
to
myField = Field(tokenize= x_tokenize, use_vocab=False, sequential=True, pad_token=0)
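The reason this fixes it: with use_vocab=False there is no vocabulary to map a pad symbol through, yet pad() still fills short sequences with the default pad_token, which is the string '<pad>'. torch.tensor in numericalize then receives lists mixing ints with a string and raises. A minimal sketch of the difference, independent of torchtext:
import torch

padded_bad = [[101, 3177, 102], [101, 102, '<pad>']]  # default pad_token is a string
padded_ok = [[101, 3177, 102], [101, 102, 0]]         # pad_token=0 keeps the batch numeric

# torch.tensor(padded_bad, dtype=torch.long)  # TypeError: an integer is required
print(torch.tensor(padded_ok, dtype=torch.long))      # builds a 2x3 LongTensor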

Pyspark ml can't fit the model and always raises "AttributeError: 'PipelinedRDD' object has no attribute '_jdf'"

data = sqlContext.sql("select a.churn,b.pay_amount,c.all_balance from db_bi.t_cust_churn a left join db_bi.t_cust_pay b on a.cust_id=b.cust_id left join db_bi.t_cust_balance c on a.cust_id=c.cust_id limit 5000").cache()

def labelData(df):
    return df.map(lambda row: LabeledPoint(row[0], row[1:]))

traindata = labelData(data)  # this step works well

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(lrdata)
Running the last line raises:
AttributeError Traceback (most recent call last)
<ipython-input-40-b84a106121e6> in <module>()
----> 1 lrModel = lr.fit(lrdata)
/home/hadoop/spark/python/pyspark/ml/pipeline.pyc in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/home/hadoop/spark/python/pyspark/ml/wrapper.pyc in _fit(self, dataset)
131
132 def _fit(self, dataset):
--> 133 java_model = self._fit_java(dataset)
134 return self._create_model(java_model)
135
/home/hadoop/spark/python/pyspark/ml/wrapper.pyc in _fit_java(self, dataset)
128 """
129 self._transfer_params_to_java()
--> 130 return self._java_obj.fit(dataset._jdf)
131
132 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
I guess you are following a tutorial for the latest Spark version (2.0.1), which uses pyspark.ml.classification import LogisticRegression, whereas you need the API matching your version, e.g. 1.6.2 with pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel. Note the different libraries: pyspark.ml estimators fit DataFrames, while pyspark.mllib trains on RDDs of LabeledPoint, which is what labelData produces.
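A hedged sketch of both routes, assuming Spark 1.6, where pyspark.ml still consumes mllib-style vectors. The mllib call fits the LabeledPoint RDD directly; the ml route first needs a DataFrame, which is what the missing _jdf attribute belongs to:
# Route 1: stay with RDDs and use the pyspark.mllib API
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
mllib_model = LogisticRegressionWithLBFGS.train(traindata, iterations=10)

# Route 2: convert the RDD of LabeledPoint to a DataFrame for pyspark.ml
from pyspark.ml.classification import LogisticRegression
train_df = sqlContext.createDataFrame(traindata)  # columns: label, features
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
ml_model = lr.fit(train_df)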

'PipelinedRDD' object has no attribute '_jdf'

It's my first post on Stack Overflow, because I can't find any clue to solve the message "'PipelinedRDD' object has no attribute '_jdf'" that appears when I call trainer.fit on my training dataset to create a neural network model under Spark in Python.
Here is my code:
from pyspark import SparkContext
from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Import data into Spark ###
RDD_RAWfileWH = sc.textFile("c:/Anaconda2/Cognet/Data_For_Cognet_ready.csv")
header = RDD_RAWfileWH.first()
# Delete header from the raw data
RDD_RAWfile1 = RDD_RAWfileWH.filter(lambda x: x != header)
# Split each line of the RDD
RDD_RAWfile = RDD_RAWfile1.map(lambda line: [float(x) for x in line.split(',')])
FinalData = RDD_RAWfile.map(lambda row: LabeledPoint(row[0], [row[1:]]))
(trainingData, testData) = FinalData.randomSplit([0.7, 0.3])
layers = [15, 2, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
# train the model
model = trainer.fit(trainingData)
and the traceback:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-28-123dce2b085a> in <module>()
46 trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128,seed=1234)
47 # train the model
---> 48 model = trainer.fit(trainingData)
49 # compute accuracy on the test set
50 # result = model.transform(test)
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\pipeline.pyc in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\wrapper.pyc in _fit(self, dataset)
131
132 def _fit(self, dataset):
--> 133 java_model = self._fit_java(dataset)
134 return self._create_model(java_model)
135
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\wrapper.pyc in _fit_java(self, dataset)
128 """
129 self._transfer_params_to_java()
--> 130 return self._java_obj.fit(dataset._jdf)
131
132 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
I'm not an expert on Spark, so if anyone knows what this _jdf attribute is and how to solve this issue, it would be very helpful for me.
Thanks a lot.
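The cause is the same as in the previous question: pyspark.ml estimators fit DataFrames (which carry the _jdf handle to the underlying Java DataFrame), not RDDs, and MultilayerPerceptronClassifier has no pyspark.mllib counterpart, so the RDD must be converted. Note also that LabeledPoint(row[0], [row[1:]]) wraps the features in an extra list; row[1:] alone is likely intended. A hedged sketch of the conversion, assuming Spark 1.6.1 (per the paths in the traceback), where pyspark.ml still accepts mllib-style label/features columns:
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
# createDataFrame turns an RDD of LabeledPoint into a DataFrame
# with 'label' and 'features' columns that trainer.fit can consume
train_df = sqlContext.createDataFrame(trainingData)
test_df = sqlContext.createDataFrame(testData)
model = trainer.fit(train_df)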
