I am trying to run the following code, but I am getting a "too many values to unpack" error.
The code is:
import csv
import json
import pandas as pd
df = pd.read_csv("job/my_data_frame_test.csv", encoding="utf-8")
df.info()
print(df)
  TEXT text recommended
0  ABC              yes
1  DEF               no
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
cl = NaiveBayesClassifier(df)
After running this code, I get the following error (in full):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-3d683b8c482a> in <module>
----> 1 cl = NaiveBayesClassifier(df)
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in __init__(self, train_set, feature_extractor, format, **kwargs)
203 def __init__(self, train_set,
204 feature_extractor=basic_extractor, format=None, **kwargs):
--> 205 super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs)
206 self.train_features = [(self.extract_features(d), c) for d, c in self.train_set]
207
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in __init__(self, train_set, feature_extractor, format, **kwargs)
137 else: # train_set is a list of tuples
138 self.train_set = train_set
--> 139 self._word_set = _get_words_from_dataset(self.train_set) # Keep a hidden set of unique words.
140 self.train_features = None
141
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in _get_words_from_dataset(dataset)
61 return words
62 all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
---> 63 return set(all_words)
64
65 def _get_document_tokens(document):
/usr/local/lib/python3.8/dist-packages/textblob/classifiers.py in <genexpr>(.0)
60 else:
61 return words
---> 62 all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
63 return set(all_words)
64
ValueError: too many values to unpack (expected 2)
NaiveBayesClassifier() expects a list of tuples of the form (text, label):
train = list(zip(df['TEXT'], df['text recommended']))
# [('ABC', 'yes'), ('DEF', 'no')]
cl = NaiveBayesClassifier(train)
# <NaiveBayesClassifier trained on 2 instances>
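Once trained, the classifier can be used directly on new strings; a minimal usage sketch (the input text below is just a placeholder, only the standard TextBlob classifier API is used):
label = cl.classify("some new text")           # e.g. 'yes' or 'no'
prob_dist = cl.prob_classify("some new text")  # probability distribution over the labels
print(label, round(prob_dist.prob("yes"), 3))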
def fetchData(fileName, modelObj):
    data = pd.read_csv('C:/Users/Owner/Desktop/Project/Datasets/Data.csv')
    print("Enter the size of data to train and test: ")
    dataSize = input()
    data = data.loc[:dataSize]
    trainDataSize = int(abs(float(dataSize) * 0.8))
    testStartIndex = int(trainDataSize)
    testEndIndex = int(dataSize)
    # fetching data text feature from data set for training
    X_train = data.iloc[:trainDataSize, 2].values
    # fetching real or fake feature from data set for training
    y_train = data.iloc[:trainDataSize, -1].values
    # fetching data text feature from data set for testing
    X_test = data.iloc[testStartIndex:testEndIndex, 2].values
    # fetching real or fake feature from data set for testing
    y_test = data.iloc[testStartIndex:testEndIndex, -1].values
    print("The data split is as follows:")
    print("X-train :", len(X_train))
    print("Y-train :", len(y_train))
    print("X-test :", len(X_test))
    print("Y-test :", len(y_test))
    '''fetch stop words list from nltk'''
    stopwords_ = [word.encode('utf-8') for word in list(stopwords.words('english'))]
    # print stopwords_
    '''Optimization of feature generation based on Model'''
    if modelObj.__class__.__name__ != 'GridSearchCV':
        maxFeatures = 50000
    else:
        maxFeatures = 10000
    '''initialize tfidf object'''
    '''feature generation -> tfidf { parameters max_features set to a fixed number to produce results fast,
       stop_words are removed by initializing the param stop_words using a
       stop words list fetched using NLTK lib }'''
    tfidf = TfidfVectorizer(min_df=1, max_features=maxFeatures, stop_words=stopwords_)
    '''Generate TF-IDF Feature for train and test data'''
    tfidfTrain = tfidf.fit_transform(X_train)
    tfidfTest = tfidf.transform(X_test)
Traceback for the error:
AttributeError Traceback (most recent call last)
<ipython-input-6-28e9ec41b050> in <module>
8 if __name__ == '__main__':
9 print ("Welcome to Fake News Detection")
---> 10 selectTasks()
<ipython-input-5-4497d6866537> in selectTasks()
27 else:
28 print ("Classification on "+MODEL[x])
---> 29 runModel(options[x](PARAMS[x]))
30
<ipython-input-3-1e5fd0540fe3> in runModel(modelObj)
3 #fileName=input()
4 ''' fetch the data split '''
----> 5 X_train,y_train,X_test,y_test=fetchData('C:/Users/Owner/Desktop/Project/Datasets/Data.csv',modelObj)
6 Visualize.plotInitalData(X_train,y_train)
7 ''' fit the Train data '''
<ipython-input-2-116c6a1f9b37> in fetchData(fileName, modelObj)
35 tfidf = TfidfVectorizer(min_df = 1, max_features = maxFeatures, stop_words=stopwords_)
36 ''' Generate TF-IDF Feature for train and test data'''
---> 37 tfidfTrain = tfidf.fit_transform(X_train)
38 tfidfTest= tfidf.transform(X_test)
39
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1844 """
1845 self._check_params()
-> 1846 X = super().fit_transform(raw_documents)
1847 self._tfidf.fit(X)
1848 # X is already a transformed view of raw_documents so
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
c:\python39\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'numpy.int64' object has no attribute 'lower'
I am getting this error and am not able to debug it. Please help. I tried converting the vectors into arrays using tfidfTrain = tfidf.fit_transform(X_train).toarray() and also tfidfTest = tfidf.transform(X_test).toarray(), but it gives me the same error. I don't understand what I should do.
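(One way to narrow this down, as a diagnostic sketch rather than a confirmed fix: the traceback ends in doc.lower() being called on a numpy.int64, which means fit_transform is receiving numbers instead of text, so .toarray() on the output cannot help. Checking what column index 2 actually contains, and forcing it to strings if the index is right, should show whether the wrong column is being selected.)
print(data.iloc[:5, 2])  # inspect what iloc column 2 really holds
# if the column index is correct but the values are stored numerically, cast them to text first
X_train = data.iloc[:trainDataSize, 2].astype(str).values
X_test = data.iloc[testStartIndex:testEndIndex, 2].astype(str).values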
Can you please help me with the following function, where I get the error ValueError: Column ordering must be equal for fit and for transform when using the remainder keyword?
(The function is called on a pickled sklearn pipeline that I had saved in GCP Storage.)
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-192-c6a8bc0ab221> in <module>
----> 1 safety_project_lite(request)
<ipython-input-190-24c565131f14> in safety_project_lite(request)
31
32 df_resp = pd.DataFrame(data=request_data)
---> 33 response = loaded_model.predict(df_resp)
34
35 output = {"Safety Rating": response[0]}
~/.local/lib/python3.5/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~/.local/lib/python3.5/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
417 Xt = X
418 for _, name, transform in self._iter(with_final=False):
--> 419 Xt = transform.transform(Xt)
420 return self.steps[-1][-1].predict(Xt, **predict_params)
421
~/.local/lib/python3.5/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
581 if (n_cols_transform >= n_cols_fit and
582 any(X.columns[:n_cols_fit] != self._df_columns)):
--> 583 raise ValueError('Column ordering must be equal for fit '
584 'and for transform when using the '
585 'remainder keyword')
ValueError: Column ordering must be equal for fit and for transform when using the remainder keyword
Code:
def safety_project_lite_beta(request):
    client = storage.Client(request.GCP_Project)
    bucket = client.get_bucket(request.GCP_Bucket)
    blob = bucket.blob(request.GCP_Path)
    model_file = BytesIO()
    blob.download_to_file(model_file)
    loaded_model = pickle.loads(model_file.getvalue())
    request_data = {'A': [request.A],
                    'B': [request.B],
                    'C': [request.C],
                    'D': [request.D],
                    'E': [request.E],
                    'F': [request.F]}
    df_resp = pd.DataFrame(data=request_data)
    response = loaded_model.predict(df_resp)
    output = {"Rating": response[0]}
    return output
The model can only predict if the data you feed it has the same structure as the data it was trained on.
To ensure that df_resp has the same columns (in the same order) as X_train, pass its list of columns along when building the dataframe:
df_resp = pd.DataFrame(request_data, columns=X_train.columns)
If that variable is for some reason not available, you could pickle its column list (X_train.columns) and use it later:
loaded_cols = pickle.loads([...])
df_resp = pd.DataFrame(data=request_data, columns=loaded_cols)
This also gives you a more dynamic workflow, where you could add columns more easily, for example.
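A minimal sketch of that idea (untested; it assumes you can still touch the training script, and fitted_pipeline is a placeholder name for the pipeline you fit there): bundle the column order together with the model when saving it, then rebuild df_resp from that stored list at prediction time.
import pickle

# at training time: store the fitted pipeline and the training column order together
with open('model_with_columns.pkl', 'wb') as f:
    pickle.dump({'model': fitted_pipeline, 'columns': list(X_train.columns)}, f)

# at prediction time: rebuild the request DataFrame with exactly those columns
with open('model_with_columns.pkl', 'rb') as f:
    bundle = pickle.load(f)

loaded_model = bundle['model']
df_resp = pd.DataFrame(data=request_data, columns=bundle['columns'])
response = loaded_model.predict(df_resp)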
I'm trying to follow the example in this notebook.
As suggested in this GitHub thread:
I've upped the ulimit to 9999.
I've already converted the CSV files to HDF5.
My code fails when trying to open a single HDF5 file into a dataframe:
df = vaex.open('data/chat_history_00.hdf5')
Here's the rest of the code:
import re
import glob
import vaex
import numpy as np

def tryint(s):
    try:
        return int(s)
    except:
        return s

def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [tryint(c) for c in re.split('([0-9]+)', s)]

hdf5_list = glob.glob('data/*.hdf5')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)
assert len(hdf5_list) == 11, "Incorrect number of files"

# Check how the single file looks like:
df = vaex.open('data/chat_history_10.hdf5')
df
Error generated:
ERROR:MainThread:vaex:error opening 'data/chat_history_00.hdf5'
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
in
      1 # Check how the single file looks like:
----> 2 df = vaex.open('data/chat_history_10.hdf5')
      3 df

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/__init__.py in open(path, convert, shuffle, copy_index, *args, **kwargs)
    207             ds = from_csv(path, copy_index=copy_index, **kwargs)
    208         else:
--> 209             ds = vaex.file.open(path, *args, **kwargs)
    210         if convert and ds:
    211             ds.export_hdf5(filename_hdf5, shuffle=shuffle)

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/file/__init__.py in open(path, *args, **kwargs)
     39             break
     40     if dataset_class:
---> 41         dataset = dataset_class(path, *args, **kwargs)
     42     return dataset
     43

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/hdf5/dataset.py in __init__(self, filename, write)
     84         self.h5table_root_name = None
     85         self._version = 1
---> 86         self._load()
     87
     88     def write_meta(self):

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/hdf5/dataset.py in _load(self)
    182     def _load(self):
    183         if "data" in self.h5file:
--> 184             self._load_columns(self.h5file["/data"])
    185             self.h5table_root_name = "/data"
    186         if "table" in self.h5file:

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/hdf5/dataset.py in _load_columns(self, h5data, first)
    348                     self.add_column(column_name, self._map_hdf5_array(data, column['mask']))
    349                 else:
--> 350                     self.add_column(column_name, self._map_hdf5_array(data))
    351             else:
    352                 transposed = shape1 < shape[0]

/usr/local/anaconda3/lib/python3.7/site-packages/vaex/dataframe.py in add_column(self, name, f_or_array, dtype)
   2929             if len(self) == len(ar):
   2930                 raise ValueError("Array is of length %s, while the length of the DataFrame is %s due to the filtering, the (unfiltered) length is %s." % (len(ar), len(self), self.length_unfiltered()))
-> 2931             raise ValueError("array is of length %s, while the length of the DataFrame is %s" % (len(ar), self.length_original()))
   2932         # assert self.length_unfiltered() == len(data), "columns should be of equal length, length should be %d, while it is %d" % (self.length_unfiltered(), len(data))
   2933         valid_name = vaex.utils.find_valid_name(name)

ValueError: array is of length 2578961, while the length of the DataFrame is 6
What does this mean and how do I troubleshoot it? All the files have 6 columns.
EDIT:
Here's how I created the hdf5 file:
pd.read_csv(r'G:/path/to/file/data/chat_history-00.csv').to_hdf(r'data/chat_history_00.hdf5', key='data')
The question has been answered by Jovan of vaex on GitHub:
You should not use pandas .to_hdf if you want to read the data with
vaex in a memory-mapped way. Please see this link for more details.
I used this instead:
vdf = vaex.from_pandas(df, copy_index=False)
vdf.export_hdf5('chat_history_00.hdf5')
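For reference, a possible end-to-end conversion sketch using the paths from the question (untested, my own assembly of the two snippets above):
import pandas as pd
import vaex

df = pd.read_csv(r'G:/path/to/file/data/chat_history-00.csv')
vdf = vaex.from_pandas(df, copy_index=False)
vdf.export_hdf5('data/chat_history_00.hdf5')

# the re-exported file can now be memory-mapped by vaex
df = vaex.open('data/chat_history_00.hdf5')
If the CSVs are too big to read with pandas in one go, vaex.from_csv(path, convert=True) should also be able to do the conversion to HDF5 in chunks, if I recall that API correctly.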
I have a 64-bit Windows 10 OS.
I have installed Python 3.6.8.
I have installed torch and torchtext using pip.
The torch version is 1.2.0.
I am trying to load the AG_NEWS dataset using the code below:
import torch
import torchtext
from torchtext.datasets import text_classification
NGRAMS = 2
import os
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](root='./.data', ngrams=NGRAMS, vocab=None)
On the last statement of the above code, I get the error below:
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-1-7e8544fdaaf6> in <module>
6 if not os.path.isdir('./.data'):
7 os.mkdir('./.data')
----> 8 train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](root='./.data', ngrams=NGRAMS, vocab=None)
9 # BATCH_SIZE = 16
10 # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\datasets\text_classification.py in AG_NEWS(*args, **kwargs)
168 """
169
--> 170 return _setup_datasets(*(("AG_NEWS",) + args), **kwargs)
171
172
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\datasets\text_classification.py in _setup_datasets(dataset_name, root, ngrams, vocab, include_unk)
126 if vocab is None:
127 logging.info('Building Vocab based on {}'.format(train_csv_path))
--> 128 vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
129 else:
130 if not isinstance(vocab, Vocab):
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\vocab.py in build_vocab_from_iterator(iterator)
555 counter = Counter()
556 with tqdm(unit_scale=0, unit='lines') as t:
--> 557 for tokens in iterator:
558 counter.update(tokens)
559 t.update(1)
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\datasets\text_classification.py in _csv_iterator(data_path, ngrams, yield_cls)
33 with io.open(data_path, encoding="utf8") as f:
34 reader = unicode_csv_reader(f)
---> 35 for row in reader:
36 tokens = ' '.join(row[1:])
37 tokens = tokenizer(tokens)
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\utils.py in unicode_csv_reader(unicode_csv_data, **kwargs)
128 maxInt = int(maxInt / 10)
129
--> 130 csv.field_size_limit(sys.maxsize)
131
132 if six.PY2:
OverflowError: Python int too large to convert to C long
I think the issue is with either Windows or torchtext, because I get the same error with the code below as well.
pos = data.TabularDataset(path='data/pos/pos_wsj_train.tsv', format='tsv',
                          fields=[('text', data.Field()), ('labels', data.Field())])
Can somebody please help? Also, note that I don't have any large numerical values in the file.
I also encountered a similar problem. I changed a line of code in my torchtext\utils.py file and my error disappeared.
Changed this:
csv.field_size_limit(sys.maxsize)
To this:
csv.field_size_limit(maxInt)
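If you would rather not edit the installed package, a workaround sketch of my own (not something torchtext provides) is to wrap csv.field_size_limit before loading the dataset, so that values too large for a C long on Windows get clamped instead of overflowing:
import csv
import ctypes

_original_limit = csv.field_size_limit
# largest value a C long can hold on this platform (2**31 - 1 on Windows)
_c_long_max = 2 ** (8 * ctypes.sizeof(ctypes.c_long) - 1) - 1

def _safe_field_size_limit(new_limit=None):
    if new_limit is None:
        return _original_limit()
    return _original_limit(min(int(new_limit), _c_long_max))

# patch the csv module before torchtext calls csv.field_size_limit(sys.maxsize)
csv.field_size_limit = _safe_field_size_limit
Newer torchtext releases reportedly guard this call themselves, so upgrading torchtext may also make the error disappear.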
I am trying to fit a sample probabilistic graphical model using sample data.
While fitting the data to the model, I am encountering a TypeError. Sample code is given below:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
data = np.random.uniform(low=0, high=2, size=(1000, 4)).astype('float')
data
data = pd.DataFrame(data, columns=['cost', 'quality', 'location', 'no_of_people'])
train = data[:750]
test = data[750:].drop('no_of_people', axis=1)
restaurant_model = BayesianModel(
[('location', 'cost'),
('quality', 'cost'),
('location', 'no_of_people'),
('cost', 'no_of_people')])
restaurant_model.fit(train)
I am encountering the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-173-8e3a85cb8b56> in <module>()
----> 1 restaurant_model.fit(train)
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\models\BayesianModel.py in fit(self, data, estimator_type)
568 estimator = estimator_type(self, data)
569
--> 570 cpds_list = estimator.get_parameters()
571 self.add_cpds(*cpds_list)
572
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\estimators\MLE.py in get_parameters(self)
64 state_counts = state_counts.reindex(sorted(state_counts.index))
65 cpd = TabularCPD(node, self.node_card[node],
---> 66 state_counts.values[:, np.newaxis])
67 cpd.normalize()
68 parameters.append(cpd)
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\factors\CPD.py in __init__(self, variable, variable_card, values, evidence, evidence_card)
137 raise TypeError("Values must be a 2D list/array")
138
--> 139 super(TabularCPD, self).__init__(variables, cardinality, values.flatten('C'))
140
141 def __repr__(self):
C:\Users\pranav.waila\AppData\Local\Continuum\Anaconda3\lib\site-packages\pgmpy\factors\Factor.py in __init__(self, variables, cardinality, values)
98
99 if values.dtype != int and values.dtype != float:
--> 100 raise TypeError("Values: Expected type int or type float, got ", values.dtype)
101
102 if len(cardinality) != len(variables):
TypeError: ('Values: Expected type int or type float, got ', dtype('int64'))