pyLDAvis: validation error when visualizing topics - python

I tried generating topics using gensim for 20000 records. On trying to visualize the topics, I get a validation error. I can print the topics after model training, but not using pyLDAvis.
import gensim
import pyLDAvis
import pyLDAvis.gensim

corpus = descriptions_lem_stop  # list of tokenized documents
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = [dic.doc2bow(doc) for doc in corpus]
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=30, id2word=dic, passes=10, workers=2)
lda_model.show_topics()

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)
vis
I got the error:
/opt/anaconda3/lib/python3.8/site-packages/pyLDAvis/gensim.py in prepare(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)
117 """
118 opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
--> 119 return vis_prepare(**opts)
/opt/anaconda3/lib/python3.8/site-packages/pyLDAvis/_prepare.py in prepare(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, R, lambda_step, mds, n_jobs, plot_opts, sort_topics)
372 doc_lengths = _series_with_name(doc_lengths, 'doc_length')
373 vocab = _series_with_name(vocab, 'vocab')
--> 374 _input_validate(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency)
375 R = min(R, len(vocab))
376
/opt/anaconda3/lib/python3.8/site-packages/pyLDAvis/_prepare.py in _input_validate(*args)
63 res = _input_check(*args)
64 if res:
---> 65 raise ValidationError('\n' + '\n'.join([' * ' + s for s in res]))
66
67
ValidationError:
* Not all rows (distributions) in topic_term_dists sum to 1.
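The validator is rejecting the topic-term matrix because at least one of its rows does not sum to 1. Before touching pyLDAvis, it is worth checking the model output directly; a minimal diagnostic sketch using gensim's get_topics() accessor (not a guaranteed fix):
import numpy as np

# get_topics() returns the (num_topics, vocab_size) topic-term matrix;
# every row should sum to ~1.0 for pyLDAvis to accept it.
topics = lda_model.get_topics()
print(np.allclose(topics.sum(axis=1), 1.0))

# If the rows do sum to 1, the mismatch is likely between `dic` and the
# corpus passed to prepare(); both must be the ones used to train the model.
# Note: on pyLDAvis >= 3.x the gensim bridge module was renamed:
#   import pyLDAvis.gensim_models as gensimvis
#   vis = gensimvis.prepare(lda_model, bow_corpus, dic)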

Fake News Detection AttributeError: 'numpy.int64' object has no attribute 'lower'

import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

def fetchData(fileName, modelObj):
    data = pd.read_csv('C:/Users/Owner/Desktop/Project/Datasets/Data.csv')
    print("Enter the size of data to train and test: ")
    dataSize = input()
    data = data.loc[:dataSize]
    trainDataSize = int(abs(float(dataSize) * 0.8))
    testStartIndex = int(trainDataSize)
    testEndIndex = int(dataSize)
    # fetching data text feature from data set for training
    X_train = data.iloc[:trainDataSize, 2].values
    # fetching real or fake feature from data set for training
    y_train = data.iloc[:trainDataSize, -1].values
    # fetching data text feature from data set for testing
    X_test = data.iloc[testStartIndex:testEndIndex, 2].values
    # fetching real or fake feature from data set for testing
    y_test = data.iloc[testStartIndex:testEndIndex, -1].values
    print("The data split is as follows:")
    print("X-train :", len(X_train))
    print("Y-train :", len(y_train))
    print("X-test :", len(X_test))
    print("Y-test :", len(y_test))
    '''fetch stop words list from nltk'''
    stopwords_ = [word.encode('utf-8') for word in list(stopwords.words('english'))]
    # print(stopwords_)
    '''Optimization of feature generation based on Model'''
    if modelObj.__class__.__name__ != 'GridSearchCV':
        maxFeatures = 50000
    else:
        maxFeatures = 10000
    '''initialize tfidf object
       feature generation -> tfidf { parameter max_features set to a fixed number to produce results fast,
       stop_words are removed by initializing the param stop_words using a
       stop words list fetched using the NLTK lib }'''
    tfidf = TfidfVectorizer(min_df=1, max_features=maxFeatures, stop_words=stopwords_)
    '''Generate TF-IDF features for train and test data'''
    tfidfTrain = tfidf.fit_transform(X_train)
    tfidfTest = tfidf.transform(X_test)
Traceback for the error:
AttributeError Traceback (most recent call last)
<ipython-input-6-28e9ec41b050> in <module>
8 if __name__ == '__main__':
9 print ("Welcome to Fake News Detection")
---> 10 selectTasks()
<ipython-input-5-4497d6866537> in selectTasks()
27 else:
28 print ("Classification on "+MODEL[x])
---> 29 runModel(options[x](PARAMS[x]))
30
<ipython-input-3-1e5fd0540fe3> in runModel(modelObj)
3 #fileName=input()
4 ''' fetch the data split '''
----> 5 X_train,y_train,X_test,y_test=fetchData('C:/Users/Owner/Desktop/Project/Datasets/Data.csv',modelObj)
6 Visualize.plotInitalData(X_train,y_train)
7 ''' fit the Train data '''
<ipython-input-2-116c6a1f9b37> in fetchData(fileName, modelObj)
35 tfidf = TfidfVectorizer(min_df = 1, max_features = maxFeatures, stop_words=stopwords_)
36 ''' Generate TF-IDF Feature for train and test data'''
---> 37 tfidfTrain = tfidf.fit_transform(X_train)
38 tfidfTest= tfidf.transform(X_test)
39
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1844 """
1845 self._check_params()
-> 1846 X = super().fit_transform(raw_documents)
1847 self._tfidf.fit(X)
1848 # X is already a transformed view of raw_documents so
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'numpy.int64' object has no attribute 'lower'
I am getting this error and am not able to debug it. I tried converting the vectors into arrays using tfidfTrain = tfidf.fit_transform(X_train).toarray() and tfidfTest = tfidf.transform(X_test).toarray(), but I get the same error. What should I do?
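The traceback shows the failure happens inside the vectorizer's preprocessor, when .lower() is called on a value taken from X_train: at least one entry in column 2 is a numpy.int64 rather than a string, so calling .toarray() on the output cannot help, because the crash happens before any output exists. A sketch of one likely fix, casting the text column to str before vectorizing (column index 2 and the input() handling are taken from the question's own code):
# input() returns a string; convert it once so row slicing uses an integer
dataSize = int(input("Enter the size of data to train and test: "))
data = data.iloc[:dataSize]

# Cast the text column to str so every entry supports .lower()
X_train = data.iloc[:trainDataSize, 2].astype(str).values
X_test = data.iloc[testStartIndex:testEndIndex, 2].astype(str).values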

TextBlob error: too many values to unpack

I am trying to run the following code, but I get an error that there are too many values to unpack.
The code is:
import csv
import json
import pandas as pd
df = pd.read_csv("job/my_data_frame_test.csv", encoding="utf-8")
df.info()
print(df)
  TEXT text recommended
0  ABC              yes
1  DEF               no
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
cl = NaiveBayesClassifier(df)
After running this code, I get the following error (in full):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-3d683b8c482a> in <module>
----> 1 cl = NaiveBayesClassifier(df)
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in __init__(self, train_set, feature_extractor, format, **kwargs)
203 def __init__(self, train_set,
204 feature_extractor=basic_extractor, format=None, **kwargs):
--> 205 super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs)
206 self.train_features = [(self.extract_features(d), c) for d, c in self.train_set]
207
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in __init__(self, train_set, feature_extractor, format, **kwargs)
137 else: # train_set is a list of tuples
138 self.train_set = train_set
--> 139 self._word_set = _get_words_from_dataset(self.train_set) # Keep a hidden set of unique words.
140 self.train_features = None
141
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in _get_words_from_dataset(dataset)
61 return words
62 all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
---> 63 return set(all_words)
64
65 def _get_document_tokens(document):
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in <genexpr>(.0)
60 else:
61 return words
---> 62 all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
63 return set(all_words)
64
ValueError: too many values to unpack (expected 2)
NaiveBayesClassifier() expects a list of tuples of the form (text, label):
train = list(zip(df['TEXT'], df['text recommended']))
# [('ABC', 'yes'), ('DEF', 'no')]
cl = NaiveBayesClassifier(train)
# <NaiveBayesClassifier trained on 2 instances>
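Once trained this way, the classifier accepts raw strings; a quick sanity check (the predicted label shown is illustrative):
cl.classify("ABC")
# 'yes'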

Custom Multiple Input Primitive Bug returns "TypeError: issubclass() arg 1 must be a class"

I am using the Featuretools library to try to generate custom features involving customer transactions. I tested the function on its own and it returns the expected answer, so I am not sure why I am getting this error.
I tried using the following link:
https://featuretools.alteryx.com/en/stable/getting_started/primitives.html
Thank you!
import featuretools as ft
import numpy as np
import pandas as pd
from featuretools.primitives import make_agg_primitive
from featuretools.variable_types import DatetimeTimeIndex, Numeric, Categorical

def test_fun(categorical, datetimeindex):
    x = pd.DataFrame({'store_name': categorical, 'session_start_time': datetimeindex})
    x_mode = list(x['store_name'].mode())[0]
    x = x[x['store_name'] == x_mode]
    y = x.session_start_time.diff().fillna(pd.Timedelta(seconds=0)) / np.timedelta64(1, 's')
    return y.median()

Test_Fun = make_agg_primitive(function=test_fun,
                              input_types=[Categorical, DatetimeTimeIndex],
                              return_type=[Numeric])

fm, fd = ft.dfs(
    entityset=es,
    target_entity='customers',
    agg_primitives=[Test_Fun],
    cutoff_time=lt,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=True,
)
This results in the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-492-358f980bb6b0> in <module>
20 return_type = [Numeric])
21
---> 22 fm, fd = ft.dfs(
23 entityset = es,
24 target_entity = 'customers',
~\Anaconda3\lib\site-packages\featuretools\utils\entry_point.py in function_wrapper(*args, **kwargs)
38 ep.on_error(error=e,
39 runtime=runtime)
---> 40 raise e
41
42 # send return value
~\Anaconda3\lib\site-packages\featuretools\utils\entry_point.py in function_wrapper(*args, **kwargs)
30 # call function
31 start = time.time()
---> 32 return_value = func(*args, **kwargs)
33 runtime = time.time() - start
34 except Exception as e:
~\Anaconda3\lib\site-packages\featuretools\synthesis\dfs.py in dfs(entities, relationships, entityset, target_entity, cutoff_time, instance_ids, agg_primitives, trans_primitives, groupby_trans_primitives, allowed_paths, max_depth, ignore_entities, ignore_variables, primitive_options, seed_features, drop_contains, drop_exact, where_primitives, max_features, cutoff_time_in_index, save_progress, features_only, training_window, approximate, chunk_size, n_jobs, dask_kwargs, verbose, return_variable_types, progress_callback, include_cutoff_time)
259 seed_features=seed_features)
260
--> 261 features = dfs_object.build_features(
262 verbose=verbose, return_variable_types=return_variable_types)
263
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in build_features(self, return_variable_types, verbose)
287 assert isinstance(return_variable_types, list), msg
288
--> 289 self._run_dfs(self.es[self.target_entity_id], RelationshipPath([]),
290 all_features, max_depth=self.max_depth)
291
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in _run_dfs(self, entity, relationship_path, all_features, max_depth)
412 """
413
--> 414 self._build_transform_features(all_features, entity, max_depth=max_depth)
415
416 """
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in _build_transform_features(self, all_features, entity, max_depth, require_direct_input)
576 input_types = input_types[0]
577
--> 578 matching_inputs = self._get_matching_inputs(all_features,
579 entity,
580 new_max_depth,
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in _get_matching_inputs(self, all_features, entity, max_depth, input_types, primitive, primitive_options, require_direct_input, feature_filter)
793 primitive, primitive_options, require_direct_input=False,
794 feature_filter=None):
--> 795 features = self._features_by_type(all_features=all_features,
796 entity=entity,
797 max_depth=max_depth,
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in _features_by_type(self, all_features, entity, max_depth, variable_type)
768 if (variable_type == variable_types.PandasTypes._all or
769 f.variable_type == variable_type or
--> 770 any(issubclass(f.variable_type, vt) for vt in variable_type)):
771 if max_depth is None or f.get_depth(stop_at=self.seed_features) <= max_depth:
772 selected_features.append(f)
~\Anaconda3\lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py in <genexpr>(.0)
768 if (variable_type == variable_types.PandasTypes._all or
769 f.variable_type == variable_type or
--> 770 any(issubclass(f.variable_type, vt) for vt in variable_type)):
771 if max_depth is None or f.get_depth(stop_at=self.seed_features) <= max_depth:
772 selected_features.append(f)
TypeError: issubclass() arg 1 must be a class
I think I figured it out. If there exists a better way, please let me know!
I'm not sure why the approach in the documentation didn't work (it uses functions instead of classes and made no mention of classes).
I was able to leverage the solution from this question to solve the problem:
How to get an item's group mean but exclude the item itself?
from featuretools.primitives import AggregationPrimitive

class Test_Fun(AggregationPrimitive):
    name = "test_fun"
    input_types = [Categorical, DatetimeTimeIndex]
    return_type = Numeric
    stack_on_self = False

    def get_function(self):
        def mean_excluding_value(categorical, datetimeindex):
            x = pd.DataFrame({'store_name': categorical, 'session_start_time': datetimeindex})
            x_mode = list(x['store_name'].mode())[0]
            x = x[x['store_name'] == x_mode]
            y = x.session_start_time.diff().fillna(pd.Timedelta(seconds=0)) / np.timedelta64(1, 's')
            return y.median()
        return mean_excluding_value

fm, fd = ft.dfs(
    entityset=es,
    target_entity='customers',
    agg_primitives=[Test_Fun],
    cutoff_time=lt,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=True,
)
In this section of the code:
Test_Fun = make_agg_primitive(function=test_fun,
                              input_types=[Categorical, DatetimeTimeIndex],
                              return_type=[Numeric])
return_type should be set to Numeric instead of [Numeric]
This code worked for me:
Test_Fun = make_agg_primitive(function=test_fun,
                              input_types=[Categorical, DatetimeTimeIndex],
                              return_type=Numeric)

statsmodel GLM fit_constrained - object of type 'int' has no len()

I'm using statsmodels to fit a GLM (Negative Binomial) and I want to add some constraints to the fit.
The regression involves five parameters, among them a, b, c and d, and the constraint is a + b = 0.
The relevant code is the following:
model = sm.GLM(y_metrics, X, family=sm.families.NegativeBinomial(alpha=1))
result = model.fit_constrained("a + b = 0")
but when I run the code I get the following error:
44
45 model = sm.GLM(y_metrics, X, family=sm.families.NegativeBinomial(alpha=1))
---> 46 result = mod.fit_constrained("a + b = 0")
47
48
~/opt/anaconda3/lib/python3.7/site-packages/statsmodels/genmod/generalized_linear_model.py in fit_constrained(self, constraints, start_params, **fit_kwds)
1345
1346 # same pattern as in base.LikelihoodModel.t_test
-> 1347 lc = DesignInfo(self.exog_names).linear_constraint(constraints)
1348 R, q = lc.coefs, lc.constants
1349
~/opt/anaconda3/lib/python3.7/site-packages/patsy/design_info.py in linear_constraint(self, constraint_likes)
534 di.linear_constraint("x1 = x2 = 3")
535 """
--> 536 return linear_constraint(constraint_likes, self.column_names)
537
538 def describe(self):
~/opt/anaconda3/lib/python3.7/site-packages/patsy/constraint.py in linear_constraint(constraint_like, variable_names)
401 if not isinstance(code, str):
402 raise ValueError("expected a string, not %r" % (code,))
--> 403 tree = parse_constraint(code, variable_names)
404 evaluator = _EvalConstraint(variable_names)
405 constraints.append(evaluator.eval(tree, constraint=True))
~/opt/anaconda3/lib/python3.7/site-packages/patsy/constraint.py in parse_constraint(string, variable_names)
235
236 def parse_constraint(string, variable_names):
--> 237 return infix_parse(_tokenize_constraint(string, variable_names),
238 _ops, _atomic)
239
~/opt/anaconda3/lib/python3.7/site-packages/patsy/constraint.py in _tokenize_constraint(string, variable_names)
177
178 # Prefer long matches:
--> 179 variable_names = sorted(variable_names, key=len, reverse=True)
180 variable_re = "|".join([re.escape(n) for n in variable_names])
181
TypeError: object of type 'int' has no len()
Any idea why this is happening?
Thanks in advance
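The last frame of the traceback is the clue: patsy sorts self.exog_names by len and trips over an int, which means the exog names are integers rather than strings. That typically happens when X is a bare numpy array (or a DataFrame with integer column headers), so the string constraint "a + b = 0" has no names to bind to. A sketch of one likely fix, assuming X currently lacks string column labels (the column names below are placeholders for your actual regressors):
import pandas as pd
import statsmodels.api as sm

# Give the design matrix one string name per column so patsy can
# parse "a + b = 0" against them (names here are placeholders).
X = pd.DataFrame(X, columns=['a', 'b', 'c', 'd', 'e'])

model = sm.GLM(y_metrics, X, family=sm.families.NegativeBinomial(alpha=1))
result = model.fit_constrained("a + b = 0")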

AttributeError: 'str' object has no attribute 'name' in Tensorflow

I am trying to predict item prices using DNNRegressor, and I cannot figure out this error that keeps coming up. I created tf numeric and categorical columns from a pandas dataframe and fed them into the DNNRegressor. There is not much help online regarding this particular error.
Please help me fix this error. Thanks
AttributeError Traceback (most recent call last)
<ipython-input-27-790ecef8c709> in <module>()
92
93 if __name__ == '__main__':
---> 94 main()
<ipython-input-27-790ecef8c709> in main()
81 # learning_rate=0.1, l1_regularization_strength=0.001))
82 est = tf.estimator.DNNRegressor(feature_columns = feature_columns, hidden_units = [10, 10], model_dir = 'data')
---> 83 est.train(input_fn = get_train_input_fn(Xtrain, ytrain), steps = 500)
84 scores = est.evaluate(input_fn = get_test_input_fn(Xtest, ytest))
85 print('Loss Score: {0:f}' .format(scores['average_loss']))
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps)
239 hooks.append(training.StopAtStepHook(steps, max_steps))
240
--> 241 loss = self._train_model(input_fn=input_fn, hooks=hooks)
242 logging.info('Loss for final step: %s.', loss)
243 return self
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks)
628 input_fn, model_fn_lib.ModeKeys.TRAIN)
629 estimator_spec = self._call_model_fn(features, labels,
--> 630 model_fn_lib.ModeKeys.TRAIN)
631 ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
632 all_hooks.extend(hooks)
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _call_model_fn(self, features, labels, mode)
613 if 'config' in model_fn_args:
614 kwargs['config'] = self.config
--> 615 model_fn_results = self._model_fn(features=features, **kwargs)
616
617 if not isinstance(model_fn_results, model_fn_lib.EstimatorSpec):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py in _model_fn(features, labels, mode, config)
389 dropout=dropout,
390 input_layer_partitioner=input_layer_partitioner,
--> 391 config=config)
392 super(DNNRegressor, self).__init__(
393 model_fn=_model_fn, model_dir=model_dir, config=config)
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py in _dnn_model_fn(features, labels, mode, head, hidden_units, feature_columns, optimizer, activation_fn, dropout, input_layer_partitioner, config)
100 net = feature_column_lib.input_layer(
101 features=features,
--> 102 feature_columns=feature_columns)
103
104 for layer_id, num_hidden_units in enumerate(hidden_units):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in input_layer(features, feature_columns, weight_collections, trainable)
205 ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
206 """
--> 207 _check_feature_columns(feature_columns)
208 for column in feature_columns:
209 if not isinstance(column, _DenseColumn):
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in _check_feature_columns(feature_columns)
1660 name_to_column = dict()
1661 for column in feature_columns:
-> 1662 if column.name in name_to_column:
1663 raise ValueError('Duplicate feature column name found for columns: {} '
1664 'and {}. This usually means that these columns refer to '
C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in name(self)
2451 @property
2452 def name(self):
-> 2453 return '{}_indicator'.format(self.categorical_column.name)
2454
2455 def _transform_feature(self, inputs):
AttributeError: 'str' object has no attribute 'name'
And below is the code:
import tensorflow as tf
from sklearn.model_selection import train_test_split

def get_train_input_fn(Xtrain, ytrain):
    return tf.estimator.inputs.pandas_input_fn(
        x = Xtrain,
        y = ytrain,
        batch_size = 30,
        num_epochs = None,
        shuffle = True)

def get_test_input_fn(Xtest, ytest):
    return tf.estimator.inputs.pandas_input_fn(
        x = Xtest,
        y = ytest,
        batch_size = 32,
        num_epochs = 1,
        shuffle = False)

def main():
    Xtrain, Xtest, ytrain, ytest = train_test_split(merc, ytr, test_size = 0.4, random_state = 42)
    feature_columns = []
    brand_rating = tf.feature_column.numeric_column('brand_rating')
    feature_columns.append(brand_rating)
    sentiment = tf.feature_column.numeric_column('description_polarity')
    feature_columns.append(sentiment)
    item_condition = tf.feature_column.numeric_column('item_condition_id')
    feature_columns.append(item_condition)
    shipping = tf.feature_column.indicator_column('shipping')
    feature_columns.append(shipping)
    name = tf.feature_column.embedding_column('item_name', 34)  # (column name, dimension (no. of unique values ** 0.25))
    feature_columns.append(name)
    general = tf.feature_column.categorical_column_with_hash_bucket('General', 12)
    feature_columns.append(general)
    sc1 = tf.feature_column.categorical_column_with_hash_bucket('SC1', 120)
    feature_columns.append(sc1)
    sc2 = tf.feature_column.categorical_column_with_hash_bucket('SC2', 900)
    feature_columns.append(sc2)
    print(feature_columns)
    #est = tf.estimator.DNNRegressor(feature_columns, hidden_units = [10, 10], optimizer=tf.train.ProximalAdagradOptimizer(
    #    learning_rate=0.1, l1_regularization_strength=0.001))
    est = tf.estimator.DNNRegressor(feature_columns = feature_columns, hidden_units = [10, 10], model_dir = 'data')
    est.train(input_fn = get_train_input_fn(Xtrain, ytrain), steps = 500)
The first argument to tf.feature_column.embedding_column must be a categorical column, not a string. See API spec.
The offending line in your code is:
tf.feature_column.embedding_column('item_name', 34)
After using
general = tf.feature_column.categorical_column_with_hash_bucket('General', 12)
and other feature_column.categorical_column_with..., you should use
general_indicator = tf.feature_column.indicator_column(general)
and then append it to your feature_columns list.
feature_columns.append(general_indicator)
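The same rule applies to tf.feature_column.indicator_column('shipping') in the question, which is the line the '{}_indicator' frame in the traceback actually points at. Putting the fixes together, a sketch of corrected column definitions (the hash bucket size for item_name is an assumed placeholder; the 34 and 12 come from the question):
# embedding_column needs a categorical column, not a raw column name
item_name_cat = tf.feature_column.categorical_column_with_hash_bucket('item_name', 1000)  # bucket size assumed
name = tf.feature_column.embedding_column(item_name_cat, 34)
feature_columns.append(name)

# indicator_column likewise wraps a categorical column, not a string
shipping_cat = tf.feature_column.categorical_column_with_vocabulary_list('shipping', [0, 1])  # vocab assumed
feature_columns.append(tf.feature_column.indicator_column(shipping_cat))

# hashed categorical columns must be wrapped before a DNNRegressor can use them
general = tf.feature_column.categorical_column_with_hash_bucket('General', 12)
feature_columns.append(tf.feature_column.indicator_column(general))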
