def fetchData(fileName, modelObj):
    data = pd.read_csv(fileName)
    print("Enter the size of data to train and test: ")
    dataSize = int(input())
    data = data.loc[:dataSize]
    trainDataSize = int(abs(dataSize * 0.8))
    testStartIndex = trainDataSize
    testEndIndex = dataSize
    # fetching the text feature from the data set for training
    X_train = data.iloc[:trainDataSize, 2].values
    # fetching the real-or-fake label from the data set for training
    y_train = data.iloc[:trainDataSize, -1].values
    # fetching the text feature from the data set for testing
    X_test = data.iloc[testStartIndex:testEndIndex, 2].values
    # fetching the real-or-fake label from the data set for testing
    y_test = data.iloc[testStartIndex:testEndIndex, -1].values
    print("The data split is as follows:")
    print("X-train :", len(X_train))
    print("Y-train :", len(y_train))
    print("X-test  :", len(X_test))
    print("Y-test  :", len(y_test))
    # fetch the stop words list from NLTK (plain str, as TfidfVectorizer expects)
    stopwords_ = stopwords.words('english')
    # optimization of feature generation based on the model
    if modelObj.__class__.__name__ != 'GridSearchCV':
        maxFeatures = 50000
    else:
        maxFeatures = 10000
    # initialize the TF-IDF vectorizer; max_features is fixed to produce results fast,
    # stop words are removed via the NLTK list above
    tfidf = TfidfVectorizer(min_df=1, max_features=maxFeatures, stop_words=stopwords_)
    # generate TF-IDF features for train and test data
    tfidfTrain = tfidf.fit_transform(X_train)
    tfidfTest = tfidf.transform(X_test)
Traceback for the error:
AttributeError Traceback (most recent call last)
<ipython-input-6-28e9ec41b050> in <module>
8 if __name__ == '__main__':
9 print ("Welcome to Fake News Detection")
---> 10 selectTasks()
<ipython-input-5-4497d6866537> in selectTasks()
27 else:
28 print ("Classification on "+MODEL[x])
---> 29 runModel(options[x](PARAMS[x]))
30
<ipython-input-3-1e5fd0540fe3> in runModel(modelObj)
3 #fileName=input()
4 ''' fetch the data split '''
----> 5 X_train,y_train,X_test,y_test=fetchData('C:/Users/Owner/Desktop/Project/Datasets/Data.csv',modelObj)
6 Visualize.plotInitalData(X_train,y_train)
7 ''' fit the Train data '''
<ipython-input-2-116c6a1f9b37> in fetchData(fileName, modelObj)
35 tfidf = TfidfVectorizer(min_df = 1, max_features = maxFeatures, stop_words=stopwords_)
36 ''' Generate TF-IDF Feature for train and test data'''
---> 37 tfidfTrain = tfidf.fit_transform(X_train)
38 tfidfTest= tfidf.transform(X_test)
39
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1844 """
1845 self._check_params()
-> 1846 X = super().fit_transform(raw_documents)
1847 self._tfidf.fit(X)
1848 # X is already a transformed view of raw_documents so
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'numpy.int64' object has no attribute 'lower'
I am getting this error and am not able to debug it. Please help. I tried converting the vectors into arrays using tfidfTrain = tfidf.fit_transform(X_train).toarray() and tfidfTest = tfidf.transform(X_test).toarray(), but it gives me the same error. I don't understand what I should do.
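A likely cause, offered as a suggestion rather than a definitive fix: 'numpy.int64' object has no attribute 'lower' means the documents handed to TfidfVectorizer are integers, i.e. data.iloc[:, 2] is not actually the text column of this CSV (or is being read as numeric). .toarray() won't help, because the failure happens before any matrix is built. A small sketch of how one might check and fix this; the column name 'text' below is an assumption, not taken from the original data set:

# inspect the frame to see which column really holds the article text
print(data.dtypes)
print(data.head())

# select the text column explicitly (assumed name 'text') and force string dtype,
# so TfidfVectorizer never receives numpy.int64 values
textCol = 'text'   # hypothetical column name, adjust to the real one
X_train = data[textCol].iloc[:trainDataSize].astype(str).values
X_test = data[textCol].iloc[testStartIndex:testEndIndex].astype(str).values

tfidfTrain = tfidf.fit_transform(X_train)
tfidfTest = tfidf.transform(X_test)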
Related
I'm building a sentiment analyzer. I built a model that successfully predicts the sentiment of texts, but I can't figure out how to save my entire model with pickle. I can save clf, but I can't save the vectorizer correctly.
In the function trainModel, I return featuresTrain and save it after training my model. After loading both files I run predict(), which gives the error mentioned in the title after it runs vectorizer.transform(). I thought featuresTrain contained the fitted vocabulary, so I'm confused. Any insights?
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False, ngram_range=(1, 2))

def trainModel(df, category, quantity):
    df = pd.read_csv('/Users/NOT/Desktop/VSA/datasets/cleanedData.csv')
    train = df.sample(frac=0.8)
    test = pd.concat([df, train]).drop_duplicates(keep=False)
    featuresTrain = vectorizer.fit_transform(train[category].values.astype('U'))
    featuresTest = vectorizer.transform(test[category].values.astype('U'))
    trainLabels = [2 if sentiment == 4 else 1 if sentiment == 2 else 0 for sentiment in train[quantity]]
    testLabels = [2 if sentiment == 4 else 1 if sentiment == 2 else 0 for sentiment in test[quantity]]
    clf = sklearn.linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=100000)
    clf.fit(featuresTrain, trainLabels)
    return clf, featuresTrain

model, vector = trainModel(data, 'tweet', 'sentiment')

def predict(modelName, text):
    vec = vectorizer.transform([text])
    prediction = list(modelName.predict(vec))[0]
    probs = modelName.predict_proba(vec)
    if probs[0][0] <= .6 and probs[0][2] <= .6:
        prediction = 1
    return prediction

filenameP = '/Users/NOT/Desktop/VSA/SMmodel/sentimentAnalysisModel_pkl'
filenameVP = '/Users/NOT/Desktop/VSA/SMmodel/sentimentAnalysisVectorizer_pkl'
pickle.dump(model, open(filenameP, 'wb'))
pickle.dump(vector, open(filenameVP, 'wb'))

LM = pickle.load(open(filenameP, 'rb'))
LVM = pickle.load(open(filenameVP, 'rb'))
sentiment = predict(LM, transcript)
Error Traceback:
---------------------------------------------------------------------------
NotFittedError Traceback (most recent call last)
<ipython-input-33-f35d352629a9> in <module>
1 LM = pickle.load(open(filenameP, 'rb'))
2 LVM = pickle.load(open(filenameVP, 'rb'))
----> 3 sentiment = predict(LM, transcript)
<ipython-input-30-f97b97e7bbd5> in predict(modelName, text)
8 def predict(modelName, text):
9
---> 10 vec = vectorizer.transform([text])
11 prediction = list(modelName.predict(vec))[0]
12 probs = modelName.predict_proba(vec)
/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in transform(self, raw_documents)
1250 "Iterable over raw text documents expected, "
1251 "string object received.")
-> 1252 self._check_vocabulary()
1253
1254 # use the same matrix-building strategy as fit_transform
/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _check_vocabulary(self)
470 self._validate_vocabulary()
471 if not self.fixed_vocabulary_:
--> 472 raise NotFittedError("Vocabulary not fitted or provided")
473
474 if len(self.vocabulary_) == 0:
NotFittedError: Vocabulary not fitted or provided
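What gets pickled as filenameVP is featuresTrain, the transformed document-term matrix, not the fitted CountVectorizer, so the module-level vectorizer recreated in the new session has never been fitted when predict() calls vectorizer.transform(). A minimal sketch of one way to fix it, assuming the rest of the pipeline stays as above, is to return and pickle the fitted vectorizer itself and pass the loaded copy into predict():

def trainModel(df, category, quantity):
    train = df.sample(frac=0.8)
    test = pd.concat([df, train]).drop_duplicates(keep=False)
    featuresTrain = vectorizer.fit_transform(train[category].values.astype('U'))
    trainLabels = [2 if s == 4 else 1 if s == 2 else 0 for s in train[quantity]]
    clf = sklearn.linear_model.LogisticRegression(multi_class='multinomial',
                                                  solver='lbfgs', max_iter=100000)
    clf.fit(featuresTrain, trainLabels)
    return clf, vectorizer                      # return the fitted vectorizer, not the matrix

model, fitted_vectorizer = trainModel(data, 'tweet', 'sentiment')
pickle.dump(model, open(filenameP, 'wb'))
pickle.dump(fitted_vectorizer, open(filenameVP, 'wb'))

# in the new session: use the loaded, fitted vectorizer for transform()
LM = pickle.load(open(filenameP, 'rb'))
LVM = pickle.load(open(filenameVP, 'rb'))

def predict(modelName, vec_obj, text):
    vec = vec_obj.transform([text])             # the fitted vocabulary travels with the pickle
    prediction = list(modelName.predict(vec))[0]
    probs = modelName.predict_proba(vec)
    if probs[0][0] <= .6 and probs[0][2] <= .6:
        prediction = 1
    return prediction

sentiment = predict(LM, LVM, transcript)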
I tried generating topics with gensim for 20,000 records. When I try to visualize the topics, I get a validation error. I can print the topics after training the model, but I cannot visualize them with pyLDAvis.
corpus = descriptions_lem_stop
dic=gensim.corpora.Dictionary(corpus)
bow_corpus = [dic.doc2bow(doc) for doc in corpus]
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 30, id2word = dic, passes = 10, workers = 2)
lda_model.show_topics()
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)
vis
I got the error:
/opt/anaconda3/lib/python3.8/site-packages/pyLDAvis/gensim.py in prepare(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)
117 """
118 opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
--> 119 return vis_prepare(**opts)
/opt/anaconda3/lib/python3.8/site-packages/pyLDAvis/_prepare.py in prepare(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, R, lambda_step, mds, n_jobs, plot_opts, sort_topics)
372 doc_lengths = _series_with_name(doc_lengths, 'doc_length')
373 vocab = _series_with_name(vocab, 'vocab')
--> 374 _input_validate(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency)
375 R = min(R, len(vocab))
376
/opt/anaconda3/lib/python3.8/site-packages/pyLDAvis/_prepare.py in _input_validate(*args)
63 res = _input_check(*args)
64 if res:
---> 65 raise ValidationError('\n' + '\n'.join([' * ' + s for s in res]))
66
67
ValidationError:
* Not all rows (distributions) in topic_term_dists sum to 1.
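One thing worth checking, as a suggestion rather than a definitive fix: the validator is complaining that some rows of the topic-term matrix do not sum to 1, which typically points to a mismatch between the dictionary/corpus passed to prepare() and the ones the model was actually trained on (for example, if dic or bow_corpus was filtered or rebuilt after training). A small diagnostic sketch follows; note that recent pyLDAvis releases expose this function under pyLDAvis.gensim_models, though the traceback above shows the older pyLDAvis.gensim:

import numpy as np

# topic-term matrix as gensim stores it: shape (num_topics, len(dic))
topic_term = lda_model.get_topics()
print(topic_term.shape, len(dic))    # the vocabulary sizes should agree
print(topic_term.sum(axis=1))        # each row should be numerically 1.0

# hand pyLDAvis exactly the dictionary and bow corpus used for training
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)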
I am trying to run the following code, but I get an error saying there are too many values to unpack.
The code is:
import csv
import json
import pandas as pd

df = pd.read_csv("job/my_data_frame_test.csv", encoding="utf-8")
df.info()
print(df)

  TEXT text recommended
0  ABC              yes
1  DEF               no

from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

cl = NaiveBayesClassifier(df)
After running this code, I get the following error (in full):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-3d683b8c482a> in <module>
----> 1 cl = NaiveBayesClassifier(df)
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in __init__(self, train_set, feature_extractor, format, **kwargs)
203 def __init__(self, train_set,
204 feature_extractor=basic_extractor, format=None, **kwargs):
--> 205 super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs)
206 self.train_features = [(self.extract_features(d), c) for d, c in self.train_set]
207
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in __init__(self, train_set, feature_extractor, format, **kwargs)
137 else: # train_set is a list of tuples
138 self.train_set = train_set
--> 139 self._word_set = _get_words_from_dataset(self.train_set) # Keep a hidden set of unique words.
140 self.train_features = None
141
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in _get_words_from_dataset(dataset)
61 return words
62 all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
---> 63 return set(all_words)
64
65 def _get_document_tokens(document):
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in <genexpr>(.0)
60 else:
61 return words
---> 62 all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
63 return set(all_words)
64
ValueError: too many values to unpack (expected 2)
NaiveBayesClassifier() expects a list of tuples of the form (text, label):
train = list(zip(df['TEXT'], df['text recommended']))
# [('ABC', 'yes'), ('DEF', 'no')]
cl = NaiveBayesClassifier(train)
# <NaiveBayesClassifier trained on 2 instances>
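The error itself happens because iterating over a DataFrame yields its column names (plain strings), so the (text, label) unpacking inside textblob fails. If you prefer to stay closer to pandas, an equivalent way to build the same pairs, assuming the two columns shown above, is:

train = list(df[['TEXT', 'text recommended']].itertuples(index=False, name=None))
# [('ABC', 'yes'), ('DEF', 'no')]
cl = NaiveBayesClassifier(train)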
This isn't a new question; I found references (first and second), but none of the solutions there worked for me.
I'm new to PyTorch and am facing AttributeError: 'Field' object has no attribute 'vocab' while creating batches of text data in PyTorch using torchtext.
Following the book Deep Learning with PyTorch, I wrote the same example as explained there.
Here's the snippet:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
TEXT = data.Field(lower=True, batch_first=True, fix_length=20)
LABEL = data.Field(sequential=False)
train, test = datasets.IMDB.splits(TEXT, LABEL)
print("train.fields:", train.fields)
print()
print(vars(train[0])) # prints the object
TEXT.build_vocab(train, vectors=GloVe(name="6B", dim=300),
max_size=10000, min_freq=10)
# VOCABULARY
# print(TEXT.vocab.freqs) # freq
# print(TEXT.vocab.vectors) # vectors
# print(TEXT.vocab.stoi) # Index
train_iter, test_iter = data.BucketIterator.splits(
(train, test), batch_size=128, device=-1, shuffle=True, repeat=False) # -1 for cpu, None for gpu
# Not working (FROM BOOK)
# batch = next(iter(train_iter))
# print(batch.text)
# print()
# print(batch.label)
# This also not working (FROM Second solution)
for i in train_iter:
    print(i.text)
    print(i.label)
Here's the stacktrace:
AttributeError Traceback (most recent call last)
<ipython-input-33-433ec3a2ca3c> in <module>()
7
8
----> 9 for i in train_iter:
10 print (i.text)
11 print (i.label)
/anaconda3/lib/python3.6/site-packages/torchtext/data/iterator.py in __iter__(self)
155 else:
156 minibatch.sort(key=self.sort_key, reverse=True)
--> 157 yield Batch(minibatch, self.dataset, self.device)
158 if not self.repeat:
159 return
/anaconda3/lib/python3.6/site-packages/torchtext/data/batch.py in __init__(self, data, dataset, device)
32 if field is not None:
33 batch = [getattr(x, name) for x in data]
---> 34 setattr(self, name, field.process(batch, device=device))
35
36 @classmethod
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in process(self, batch, device)
199 """
200 padded = self.pad(batch)
--> 201 tensor = self.numericalize(padded, device=device)
202 return tensor
203
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in numericalize(self, arr, device)
300 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
301 else:
--> 302 arr = [self.vocab.stoi[x] for x in arr]
303
304 if self.postprocessing is not None:
/anaconda3/lib/python3.6/site-packages/torchtext/data/field.py in <listcomp>(.0)
300 arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
301 else:
--> 302 arr = [self.vocab.stoi[x] for x in arr]
303
304 if self.postprocessing is not None:
AttributeError: 'Field' object has no attribute 'vocab'
If BucketIterator is not the way to go, what else can I use to get a similar output?
You haven't built the vocab for the LABEL field.
After TEXT.build_vocab(train, ...), run LABEL.build_vocab(train), and the rest will run.
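In context, with that fix applied, the relevant part of the snippet would look roughly like this (same parameters as above, just with the extra LABEL.build_vocab call):

TEXT.build_vocab(train, vectors=GloVe(name="6B", dim=300),
                 max_size=10000, min_freq=10)
LABEL.build_vocab(train)   # the labels need a vocab too, otherwise numericalize fails

train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=128, device=-1, shuffle=True, repeat=False)

batch = next(iter(train_iter))
print(batch.text)
print(batch.label)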
It's my first post on Stack Overflow, because I can't find any clue to solve the message "'PipelinedRDD' object has no attribute '_jdf'" that appears when I call trainer.fit on my training dataset to create a neural network model under Spark in Python.
Here is my code:
from pyspark import SparkContext
from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
### Import data in Spark ###
RDD_RAWfileWH= sc.textFile("c:/Anaconda2/Cognet/Data_For_Cognet_ready.csv")
header = RDD_RAWfileWH.first()
# Delete header from RAWData
RDD_RAWfile1 = RDD_RAWfileWH.filter(lambda x: x != header)
# Split each line of the RDD
RDD_RAWfile = RDD_RAWfile1.map(lambda line:[float(x) for x in line.split(',')])
FinalData = RDD_RAWfile.map(lambda row: LabeledPoint(row[0],[row[1:]]))
(trainingData, testData) = FinalData.randomSplit([0.7, 0.3])
layers = [15, 2, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128,seed=1234)
# train the model
model = trainer.fit(trainingData)
and the traceback:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-28-123dce2b085a> in <module>()
46 trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128,seed=1234)
47 # train the model
---> 48 model = trainer.fit(trainingData)
49 # compute accuracy on the test set
50 # result = model.transform(test)
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\pipeline.pyc in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\wrapper.pyc in _fit(self, dataset)
131
132 def _fit(self, dataset):
--> 133 java_model = self._fit_java(dataset)
134 return self._create_model(java_model)
135
C:\Users\piod7321\spark-1.6.1-bin-hadoop2.6\python\pyspark\ml\wrapper.pyc in _fit_java(self, dataset)
128 """
129 self._transfer_params_to_java()
--> 130 return self._java_obj.fit(dataset._jdf)
131
132 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
I'm not an expert on Spark, so if anyone knows what this _jdf attribute is and how to solve this issue, it would be very helpful to me.
Thanks a lot.
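For what it's worth, the error comes from mixing the two Spark APIs: MultilayerPerceptronClassifier lives in pyspark.ml, which works on DataFrames, while the code builds an RDD of pyspark.mllib LabeledPoint objects, so fit() fails when it tries to access dataset._jdf. A minimal sketch of the conversion, under the assumption that the parsed RDD_RAWfile stays the same (on Spark 2.x and later the Vectors class would come from pyspark.ml.linalg instead):

from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vectors   # use pyspark.ml.linalg on Spark 2.x+

sqlContext = SQLContext(sc)

# build (label, features) pairs and turn the RDD into a DataFrame
labeled = RDD_RAWfile.map(lambda row: (float(row[0]), Vectors.dense(row[1:])))
FinalData = sqlContext.createDataFrame(labeled, ["label", "features"])

(trainingData, testData) = FinalData.randomSplit([0.7, 0.3])

trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[15, 2, 3],
                                         blockSize=128, seed=1234)
model = trainer.fit(trainingData)           # fit() now receives a DataFrame, not an RDD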