This is not a new question; I found related references (first and second), but none of their solutions worked for me.
I'm new to PyTorch and I'm getting AttributeError: 'Field' object has no attribute 'vocab' while creating batches of text data with torchtext.
I followed the book Deep Learning with PyTorch and wrote the same example as explained there.
Here's the snippet:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
TEXT = data.Field(lower=True, batch_first=True, fix_length=20)
LABEL = data.Field(sequential=False)
train, test = datasets.IMDB.splits(TEXT, LABEL)
print("train.fields:", train.fields)
print()
print(vars(train[0])) # prints the object
TEXT.build_vocab(train, vectors=GloVe(name="6B", dim=300),
                 max_size=10000, min_freq=10)
# VOCABULARY
# print(TEXT.vocab.freqs)    # frequencies
# print(TEXT.vocab.vectors)  # vectors
# print(TEXT.vocab.stoi)     # string-to-index mapping
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=128, device=-1, shuffle=True, repeat=False)  # -1 for CPU, None for GPU
# Not working (from the book)
# batch = next(iter(train_iter))
# print(batch.text)
# print()
# print(batch.label)
# This also does not work (from the second linked solution)
for i in train_iter:
    print(i.text)
    print(i.label)
Here's the stacktrace:
AttributeError Traceback (most recent call last)
<ipython-input-33-433ec3a2ca3c> in <module>()
7
8
----> 9 for i in train_iter:
10 print (i.text)
11 print (i.label)
/anaconda3/lib/python3.6/site-packages/torchtext/data/iterator.py in __iter__(self)
155 else:
156 minibatch.sort(key=self.sort_key, reverse=True)
--> 157 yield Batch(minibatch, self.dataset, self.device)
158 if not self.repeat:
159 return
/anaconda3/lib/python3.6/site-packages/torchtext/data/batch.py in __init__(self, data, dataset, device)
32 if field is not None:
33 batch = [getattr(x, name) for x in data]
---> 34 setattr(self, name, field.process(batch, device=device))
35
36 #classmethod
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in process(self, batch, device)
199 """
200 padded = self.pad(batch)
--> 201 tensor = self.numericalize(padded, device=device)
202 return tensor
203
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in numericalize(self, arr, device)
300 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
301 else:
--> 302 arr = [self.vocab.stoi[x] for x in arr]
303
304 if self.postprocessing is not None:
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in <listcomp>(.0)
300 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
301 else:
--> 302 arr = [self.vocab.stoi[x] for x in arr]
303
304 if self.postprocessing is not None:
AttributeError: 'Field' object has no attribute 'vocab'
If BucketIterator is not the right tool here, what else can I use to get a similar output?
You haven't built the vocab for the LABEL field.
After TEXT.build_vocab(train, ...), run LABEL.build_vocab(train), and the rest will run.
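For completeness, the vocabulary-building step would then look like this (a minimal sketch, assuming the same TEXT/LABEL fields and GloVe vectors as above):
TEXT.build_vocab(train, vectors=GloVe(name="6B", dim=300),
                 max_size=10000, min_freq=10)
LABEL.build_vocab(train)  # without this, LABEL has no .vocab and numericalize fails
After that, iterating over train_iter yields batches whose batch.text and batch.label are tensors.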
I am training mBART in Seq2Seq with SimpleTransformers but getting an error I am not seeing with BART:
TypeError: shift_tokens_right() missing 1 required positional argument: 'decoder_start_token_id'
So far I've tried various combinations of
model.decoder_tokenizer.add_special_tokens({"bos_token": "<s>"})
but the bos_token is already set beforehand, and using something other than bos_token raises an error saying that the token is not a special token.
That leaves me with the following code:
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
# Model Config
model_args = Seq2SeqArgs()
model_args.do_sample = True
model_args.eval_batch_size = 4 # 64
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 2500
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False # False
model_args.learning_rate = 5e-5
model_args.max_length = 128
model_args.max_seq_length = 128
model_args.num_beams = 10 # 0
model_args.num_return_sequences = 3
model_args.num_train_epochs = 2
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.top_k = 50
model_args.top_p = 0.95
model_args.train_batch_size = 4 # 8
model_args.use_multiprocessing = False
model_ru = Seq2SeqModel(
    encoder_decoder_type="mbart",
    encoder_decoder_name="IlyaGusev/mbart_ru_sum_gazeta",
    args=model_args,
    use_cuda=True
)
# Add custom tokens
model_ru.encoder_tokenizer.add_tokens(["token1", "token2"])
# already set, as seen from: model_ru.decoder_tokenizer.bos_token
model_ru.decoder_tokenizer.add_special_tokens({"bos_token": "<s>"})
model_ru.model.resize_token_embeddings(len(model_ru.encoder_tokenizer))
model_ru.train_model(train, eval_data=dev)
Which throws the following error:
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:3407: FutureWarning:
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.
Here is a short example:
model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]
See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.
warnings.warn(formatted_warning, FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_1538/3709317111.py in <module>
15 model_ru.model.resize_token_embeddings(len(model_ru.encoder_tokenizer))
16
---> 17 model_ru.train_model(train_tydiqa_ru, eval_data=dev_tydiqa_ru)
18
19 # Evaluation and training loss can also be found WandB
5 frames
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_model.py in train_model(self, train_data, output_dir, show_running_loss, args, eval_data, verbose, **kwargs)
433 self._move_model_to_device()
434
--> 435 train_dataset = self.load_and_cache_examples(train_data, verbose=verbose)
436
437 os.makedirs(output_dir, exist_ok=True)
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_model.py in load_and_cache_examples(self, data, evaluate, no_cache, verbose, silent)
1489 if args.model_type in ["bart", "mbart", "marian"]:
1490 return SimpleSummarizationDataset(
-> 1491 encoder_tokenizer, self.args, data, mode
1492 )
1493 else:
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_utils.py in __init__(self, tokenizer, args, data, mode)
423 else:
424 self.examples = [
--> 425 preprocess_fn(d) for d in tqdm(data, disable=args.silent)
426 ]
427
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_utils.py in <listcomp>(.0)
423 else:
424 self.examples = [
--> 425 preprocess_fn(d) for d in tqdm(data, disable=args.silent)
426 ]
427
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_utils.py in preprocess_data_mbart(data)
359 decoder_input_ids,
360 tokenizer.pad_token_id,
--> 361 tokenizer.lang_code_to_id[args.tgt_lang],
362 )
363
/usr/local/lib/python3.7/dist-packages/simpletransformers/seq2seq/seq2seq_utils.py in <lambda>(input_ids, pad_token_id, decoder_start_token_id)
30 shift_tokens_right = (
31 lambda input_ids, pad_token_id, decoder_start_token_id: _shift_tokens_right(
---> 32 input_ids, pad_token_id
33 )
34 )
TypeError: shift_tokens_right() missing 1 required positional argument: 'decoder_start_token_id'
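The last frame shows simpletransformers' seq2seq_utils wrapping the shift_tokens_right it imported from transformers in a two-argument lambda, while the installed transformers release requires the third decoder_start_token_id argument. A possible workaround, only a sketch under the assumption that this version mismatch is the cause (pinning an older transformers release is the other option), is to re-point that module-level lambda so the argument is forwarded:
import simpletransformers.seq2seq.seq2seq_utils as s2s_utils
from transformers.models.bart.modeling_bart import shift_tokens_right as hf_shift_tokens_right

# forward all three arguments instead of dropping decoder_start_token_id
s2s_utils.shift_tokens_right = (
    lambda input_ids, pad_token_id, decoder_start_token_id: hf_shift_tokens_right(
        input_ids, pad_token_id, decoder_start_token_id
    )
)
This would have to run before train_model so that preprocess_data_mbart picks up the patched name.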
def fetchData(fileName, modelObj):
    data = pd.read_csv('C:/Users/Owner/Desktop/Project/Datasets/Data.csv')
    print("Enter the size of data to train and test: ")
    dataSize = input()
    data = data.loc[:dataSize]
    trainDataSize = int(abs(float(dataSize) * 0.8))
    testStartIndex = int(trainDataSize)
    testEndIndex = int(dataSize)
    # fetch the text feature from the data set for training
    X_train = data.iloc[:trainDataSize, 2].values
    # fetch the real-or-fake label from the data set for training
    y_train = data.iloc[:trainDataSize, -1].values
    # fetch the text feature from the data set for testing
    X_test = data.iloc[testStartIndex:testEndIndex, 2].values
    # fetch the real-or-fake label from the data set for testing
    y_test = data.iloc[testStartIndex:testEndIndex, -1].values
    print("The data split is as follows:")
    print("X-train :", len(X_train))
    print("Y-train :", len(y_train))
    print("X-test :", len(X_test))
    print("Y-test :", len(y_test))
    '''fetch the stop-words list from NLTK'''
    stopwords_ = [word.encode('utf-8') for word in list(stopwords.words('english'))]
    # print(stopwords_)
    '''optimization of feature generation based on the model'''
    if modelObj.__class__.__name__ != 'GridSearchCV':
        maxFeatures = 50000
    else:
        maxFeatures = 10000
    '''initialize the TF-IDF object
       feature generation -> tfidf { max_features is set to a fixed number to produce results fast,
       stop words are removed by passing the stop_words param a stop-words list fetched with NLTK }'''
    tfidf = TfidfVectorizer(min_df=1, max_features=maxFeatures, stop_words=stopwords_)
    '''generate TF-IDF features for the train and test data'''
    tfidfTrain = tfidf.fit_transform(X_train)
    tfidfTest = tfidf.transform(X_test)
Traceback for the error:
AttributeError Traceback (most recent call last)
<ipython-input-6-28e9ec41b050> in <module>
8 if __name__ == '__main__':
9 print ("Welcome to Fake News Detection")
---> 10 selectTasks()
<ipython-input-5-4497d6866537> in selectTasks()
27 else:
28 print ("Classification on "+MODEL[x])
---> 29 runModel(options[x](PARAMS[x]))
30
<ipython-input-3-1e5fd0540fe3> in runModel(modelObj)
3 #fileName=input()
4 ''' fetch the data split '''
----> 5 X_train,y_train,X_test,y_test=fetchData('C:/Users/Owner/Desktop/Project/Datasets/Data.csv',modelObj)
6 Visualize.plotInitalData(X_train,y_train)
7 ''' fit the Train data '''
<ipython-input-2-116c6a1f9b37> in fetchData(fileName, modelObj)
35 tfidf = TfidfVectorizer(min_df = 1, max_features = maxFeatures, stop_words=stopwords_)
36 ''' Generate TF-IDF Feature for train and test data'''
---> 37 tfidfTrain = tfidf.fit_transform(X_train)
38 tfidfTest= tfidf.transform(X_test)
39
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1844 """
1845 self._check_params()
-> 1846 X = super().fit_transform(raw_documents)
1847 self._tfidf.fit(X)
1848 # X is already a transformed view of raw_documents so
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'numpy.int64' object has no attribute 'lower'
I am getting this error and I am not able to debug it. Please help. I tried converting the vectors into arrays using tfidfTrain = tfidf.fit_transform(X_train).toarray() and tfidfTest = tfidf.transform(X_test).toarray(), but it gives me the same error. I do not understand what I should do.
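Since the failure happens inside the analyzer while it lowercases each document, the toarray() calls cannot help; the numpy.int64 in the message means the values handed to the vectorizer are integers, i.e. data.iloc[:, 2] is most likely a numeric column rather than the article text. A small diagnostic sketch (column positions are assumptions, adjust to the actual CSV layout):
print(data.dtypes)            # which columns are object (text) and which are int64?
print(data.iloc[:5, 2])       # is column 2 really the text column?

# either point iloc at the real text column, or cast explicitly:
X_train = data.iloc[:trainDataSize, 2].astype(str).values
X_test = data.iloc[testStartIndex:testEndIndex, 2].astype(str).values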
I followed this tutorial
http://www.programmersought.com/article/2609385756/
to create a TabularDataset with data that is already tokenized and converted to ids, and I do not want to use a vocab or build one because the data is numerical.
So I defined my Field variable as:
myField = Field(tokenize= x_tokenize, use_vocab=False, sequential=True)
train, val, test = data.TabularDataset.splits(path='./', train=train_path, validation=valid_path,
                                              test=test_path, format='csv', fields=data_fields, skip_header=True)
train output:
print(vars(train[0])['src'])
# output: [101, 3177, 3702, 11293, 1116, 102]
and I used a BucketIterator:
train_iter = BucketIterator(train,
                            batch_size=BATCH_SIZE,
                            device=DEVICE,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            train=True,
                            batch_size_fn=batch_size_fn,
                            repeat=False)
When I run this code:
batch = next(iter(train_iter))
I get:
TypeError: an integer is required (got type list)
TypeError                                 Traceback (most recent call last)
in ()
----> 1 batch = next(iter(train_iter))
3 frames
/usr/local/lib/python3.6/dist-packages/torchtext/data/iterator.py in __iter__(self)
155 else:
156 minibatch.sort(key=self.sort_key, reverse=True)
--> 157 yield Batch(minibatch, self.dataset, self.device)
158 if not self.repeat:
159 return
/usr/local/lib/python3.6/dist-packages/torchtext/data/batch.py in __init__(self, data, dataset, device)
32 if field is not None:
33 batch = [getattr(x, name) for x in data]
---> 34 setattr(self, name, field.process(batch, device=device))
35
36 #classmethod
/usr/local/lib/python3.6/dist-packages/torchtext/data/field.py in process(self, batch, device)
199 """
200 padded = self.pad(batch)
--> 201 tensor = self.numericalize(padded, device=device)
202 return tensor
203
/usr/local/lib/python3.6/dist-packages/torchtext/data/field.py in numericalize(self, arr, device)
321 arr = self.postprocessing(arr, None)
322
--> 323 var = torch.tensor(arr, dtype=self.dtype, device=device)
324
325 if self.sequential and not self.batch_first:
TypeError: an integer is required (got type list)
You have to provide the pad_token while declaring the Field.
Change this
myField = Field(tokenize= x_tokenize, use_vocab=False, sequential=True)
to
myField = Field(tokenize= x_tokenize, use_vocab=False, sequential=True, pad_token=0)
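With the pad_token supplied, padding positions are filled with the integer 0, so numericalize can build a tensor directly from the already-numeric ids. A quick check (the field name src is taken from the question's data_fields and is an assumption):
batch = next(iter(train_iter))
print(batch.src)   # LongTensor of shape (seq_len, batch_size), since batch_first is not set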
I am using Keras with the TensorFlow backend (on Python 2.7) and I built two dense layers on top of a pretrained VGG16 model with four classes, and I get pretty decent results on my validation set. Now I want to use LIME to interpret my results.
I import the lime package and transform one of my images following the LIME GitHub repo https://github.com/marcotcr/lime/blob/master/doc/notebooks/Tutorial%20-%20Image%20Classification%20Keras.ipynb. My path_list contains one photo.
import lime
from lime import lime_image

def transform_img_fn(path_list):
    # load each image, scale pixel values to [0, 1], and stack into one batch array
    out = []
    for img_path in path_list:
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img) / 255
        x = np.expand_dims(x, axis=0)
        out.append(x)
    return np.vstack(out)

check_image = transform_img_fn(path_list)
Then
check_image[0].shape
OUTPUT: (3, 224, 224)
predictions[0]
OUTPUT: array([9.67346e-01, 3.00240e-03, 2.96037e-02, 4.79915e-05], dtype=float32)
explainer = lime_image.LimeImageExplainer()
explanation = explainer.explain_instance(check_image[0], model_top.predict, hide_color=0, num_samples=100)
I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-115-e286e97e849d> in <module>()
----> 1 explanation = explainer.explain_instance(check_image[0], model_top.predict, hide_color=0, num_samples=100)
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/site-packages/lime/lime_image.pyc in explain_instance(self, image, classifier_fn, labels, hide_color, top_labels, num_features, num_samples, batch_size, segmentation_fn, distance_metric, model_regressor, random_seed)
165 segmentation_fn = SegmentationAlgorithm('quickshift', kernel_size=4,
166 max_dist=200, ratio=0.2,
--> 167 random_seed=random_seed)
168 try:
169 segments = segmentation_fn(image)
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/site-packages/lime/wrappers/scikit_image.pyc in __init__(self, algo_type, **target_params)
103 if (self.algo_type == 'quickshift'):
104 BaseWrapper.__init__(self, quickshift, **target_params)
--> 105 kwargs = self.filter_params(quickshift)
106 self.set_params(**kwargs)
107 elif (self.algo_type == 'felzenszwalb'):
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/site-packages/lime/wrappers/scikit_image.pyc in filter_params(self, fn, override)
82 result = {}
83 for name, value in self.target_params.items():
---> 84 if has_arg(fn, name):
85 result.update({name: value})
86 result.update(override)
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/site-packages/lime/utils/generic_utils.pyc in has_arg(fn, arg_name)
19 else:
20 try:
---> 21 arg_spec = inspect.getargspec(fn.__call__)
22 except AttributeError:
23 return False
/home/ec2-user/anaconda2/envs/env1/lib/python2.7/inspect.pyc in getargspec(func)
813 func = func.im_func
814 if not isfunction(func):
--> 815 raise TypeError('{!r} is not a Python function'.format(func))
816 args, varargs, varkw = getargs(func.func_code)
817 return ArgSpec(args, varargs, varkw, func.func_defaults)
TypeError: <method-wrapper '__call__' of builtin_function_or_method object at 0x7fea20ea4e60> is not a Python function
Based on the documentation, "classifier_fn: function that takes a list of images and returns a matrix of prediction probabilities", so I passed model_top.predict for this argument. I can get all of my predictions if I call predictions = model_top.predict(validation_data, batch_size=32).
Any help would be appreciated.
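Judging from the traceback, the failure happens before the classifier is ever called: with segmentation_fn left as None, LIME builds a SegmentationAlgorithm('quickshift', ...) wrapper, and its parameter filtering calls inspect.getargspec on scikit-image's compiled quickshift, which Python 2.7 cannot introspect. A hedged sketch of a workaround, assuming that is the trigger, is to pass a plain Python segmentation function so the wrapper is never constructed (note that quickshift expects a channels-last image, so the (3, 224, 224) array may also need transposing; the parameters below just mirror LIME's defaults):
from skimage.segmentation import quickshift

def segment_fn(img):
    # plain Python function, so LIME skips the SegmentationAlgorithm wrapper
    return quickshift(img, kernel_size=4, max_dist=200, ratio=0.2)

explanation = explainer.explain_instance(check_image[0], model_top.predict,
                                         hide_color=0, num_samples=100,
                                         segmentation_fn=segment_fn)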
This is my first post on Stack Overflow because I can't find any clue to solve the message "'PipelinedRDD' object has no attribute '_jdf'" that appears when I call trainer.fit on my training dataset to create a neural network model under Spark in Python.
Here is my code:
from pyspark import SparkContext
from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
### Import data in Spark ###
RDD_RAWfileWH= sc.textFile("c:/Anaconda2/Cognet/Data_For_Cognet_ready.csv")
header = RDD_RAWfileWH.first()
# Delete header from RAWData
RDD_RAWfile1 = RDD_RAWfileWH.filter(lambda x: x != header)
# Split each line of the RDD
RDD_RAWfile = RDD_RAWfile1.map(lambda line:[float(x) for x in line.split(',')])
FinalData = RDD_RAWfile.map(lambda row: LabeledPoint(row[0],[row[1:]]))
(trainingData, testData) = FinalData.randomSplit([0.7, 0.3])
layers = [15, 2, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128,seed=1234)
# train the model
model = trainer.fit(trainingData)
and the traceback:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-28-123dce2b085a> in <module>()
46 trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128,seed=1234)
47 # train the model
---> 48 model = trainer.fit(trainingData)
49 # compute accuracy on the test set
50 # result = model.transform(test)
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\pipeline.pyc in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\wrapper.pyc in _fit(self, dataset)
131
132 def _fit(self, dataset):
--> 133 java_model = self._fit_java(dataset)
134 return self._create_model(java_model)
135
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\wrapper.pyc in _fit_java(self, dataset)
128 """
129 self._transfer_params_to_java()
--> 130 return self._java_obj.fit(dataset._jdf)
131
132 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
I'm not an expert on Spark, so if anyone knows what this _jdf attribute is and how to solve this issue, that would be very helpful for me.
Thanks a lot.
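For what it's worth, _jdf is the Java DataFrame handle that pyspark.ml wrappers call into: estimators like MultilayerPerceptronClassifier expect a DataFrame, while trainingData here is an RDD of pyspark.mllib LabeledPoint objects, which has no such attribute. A hedged sketch, assuming Spark 1.6 and the imports already in the question, is to convert the RDD to a DataFrame before splitting and fitting (note that the original extra brackets around row[1:] would also nest the features inside another list):
sqlContext = SQLContext(sc)
FinalData = RDD_RAWfile.map(lambda row: LabeledPoint(row[0], row[1:]))  # features as a flat list
dataDF = sqlContext.createDataFrame(FinalData)                          # columns: 'label', 'features'
(trainingData, testData) = dataDF.randomSplit([0.7, 0.3])
model = trainer.fit(trainingData)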