I am trying to extract data from a txt file with numpy and I get this big error:
Traceback (most recent call last):
File "C:\Python36\machine learning\bayes_classifier.py", line 14, in
<module>
data = np.loadtxt(input_file, delimiter=",")
File "C:\Python36\lib\site-packages\numpy\lib\npyio.py", line 1092, in loadtxt
for x in read_data(_loadtxt_chunksize):
File "C:\Python36\lib\site-packages\numpy\lib\npyio.py", line 1019, in read_data
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "C:\Python36\lib\site-packages\numpy\lib\npyio.py", line 1019, in <listcomp>
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "C:\Python36\lib\site-packages\numpy\lib\npyio.py", line 738, in floatconv
return float(x)
ValueError: could not convert string to float: '2.18'
Notice the ValueError: what is wrong with '2.18'?
Here is the full code:
### Naïve bayes classifier ###
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn import cross_validation
from utilities import visualize_classifier
# Input file containing data
input_file = "data_multivar_nb.txt"
# Load data from input file, splitting each line on commas.
# NOTE(review): the traceback rejects '2.18', which looks like a valid float;
# presumably the file carries an invisible character (e.g. a UTF-8 BOM) or a
# non-ASCII symbol next to that value -- inspect the raw bytes to confirm.
data = np.loadtxt(input_file, delimiter=",")
# Every column except the last goes to x; the last column goes to y.
x, y = data[:, :-1], data[:, -1]
and here is the file:
file link here
Related
Recently I have been working on a code and got stuck for days on this error. Basically the program plots a 3D colormap from csv file. I am using Python 3 with anaconda3.
https://drive.google.com/drive/folders/1hfL_TbfWwD6uZCgxiOa-xjWT1ChL2PUs?usp=sharing
This is the code:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Read the CSV whose first two columns are used as the 1-D x and y coordinates.
# NOTE(review): the tracebacks below (for a similarly loaded CSV) show the
# first field as '\ufeff9.9', i.e. the file starts with a byte-order mark.
# np.loadtxt accepts an `encoding` argument; encoding="utf-8-sig" should
# strip the mark -- confirm against these files.
DataX_Y_1D = np.loadtxt("datacsv_1d_xy.csv", delimiter=",")
# Expand column 0 and column 1 into coordinate grids.
X, Y = np.meshgrid(DataX_Y_1D[:,0], DataX_Y_1D[:,1])
# Read the height values from the second CSV.
Z = np.loadtxt("datacsv_2d_Z.csv", delimiter=",")
# Create a figure with a single 3-D axes and draw the surface on it.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z)
plt.show()
The problem looks like this:
File "/Users/lenguyen/Desktop/test_3D/plot_3D_4.py", line 1, in <module>
DataAll1D = np.loadtxt("datacsv_1d.csv", delimiter=",")
NameError: name 'np' is not defined
(base) lenguyen#ntmle2 test_3D % python plot_3D_4.py
Traceback (most recent call last):
File "/Users/lenguyen/Desktop/test_3D/plot_3D_4.py", line 5, in <module>
DataAll1D = np.loadtxt("datacsv_1d.csv", delimiter=",")
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 1148, in loadtxt
for x in read_data(_loadtxt_chunksize):
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 999, in read_data
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 999, in <listcomp>
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 736, in floatconv
return float(x)
ValueError: could not convert string to float: '\ufeff9.9'
(base) lenguyen#ntmle2 test_3D % python plot_3D_4.py
Traceback (most recent call last):
File "/Users/lenguyen/Desktop/test_3D/plot_3D_4.py", line 5, in <module>
DataAll1D = np.loadtxt("datacsv_1d.csv", delimiter=",")
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 1148, in loadtxt
for x in read_data(_loadtxt_chunksize):
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 999, in read_data
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 999, in <listcomp>
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 736, in floatconv
return float(x)
ValueError: could not convert string to float: '\ufeff9.9'
(base) lenguyen#ntmle2 test_3D % python plot_3D_4.py
Traceback (most recent call last):
File "/Users/lenguyen/Desktop/test_3D/plot_3D_4.py", line 5, in <module>
DataAll1D = np.loadtxt("datacsv_1d.csv", delimiter=",")
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 1148, in loadtxt
for x in read_data(_loadtxt_chunksize):
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 999, in read_data
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 999, in <listcomp>
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 736, in floatconv
return float(x)
ValueError: could not convert string to float: '\ufeff9.9'
(base) lenguyen#ntmle2 test_3D % python plot_3D_4.py
Traceback (most recent call last):
File "/Users/lenguyen/Desktop/test_3D/plot_3D_4.py", line 5, in <module>
DataAll1D = np.loadtxt("datacsv_1d.csv", dtype= "float", delimiter=",")
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 1148, in loadtxt
for x in read_data(_loadtxt_chunksize):
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 999, in read_data
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 999, in <listcomp>
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "/opt/anaconda3/lib/python3.9/site-packages/numpy/lib/npyio.py", line 736, in floatconv
return float(x)
ValueError: could not convert string to float: '\ufeff10'
you need to define np
import numpy as np
I was able to import your file with:
x,y = np.genfromtxt('test.csv', delimiter=',', unpack=True, skip_header=0)
Loading z data:
z_all = np.genfromtxt('datacsv_2d_z.csv', delimiter=',', unpack=True, skip_header=0)
A quick plot:
plt.imshow(z_all)
Gives:
3D colormap graph:
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
# Create a 10x10 figure whose single axes uses the 3-D projection.
fig, ax = plt.subplots(subplot_kw={"projection": "3d"}, figsize=(10, 10) )
# unpack=True returns the file's columns as separate arrays: X and Y.
X, Y = np.genfromtxt('datacsv_1d_xy.csv', delimiter=',', unpack=True, skip_header=0)
# Expand the 1-D coordinates into coordinate grids.
X, Y = np.meshgrid(X, Y)
# NOTE(review): unpack=True transposes the loaded array, so Z is the
# transpose of the layout stored in the file -- confirm this is intended.
Z = np.genfromtxt('datacsv_2d_z.csv', delimiter=',', unpack=True, skip_header=0)
# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
linewidth=0, antialiased=False)
When I tried to use word vectors computed from Chinese text as features for sklearn, an error occurred.
The shapes of x_train and word_vector are (747,) and (1, 100) respectively, and the latter's dtype is float64.
I suspected that some of the data might have a different type, but I iterated over all of it and found nothing wrong.
Here is the code:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
import SZ_function as sz
import gensim
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
def remove_stop_words(text):
    """Drop stop words from a whitespace-separated string.

    The stop-word collection is loaded from 'notebook/HIT.txt' through
    ``sz.get_step_words`` on every call. Each kept word is followed by a
    single space, so a non-empty result ends with a trailing space.
    """
    stop_words = sz.get_step_words('notebook/HIT.txt')
    kept = [token + ' ' for token in text.split() if token not in stop_words]
    return ''.join(kept)
def pre_process(path):
    """Load the Excel sheet at *path* and clean its 'text' column.

    The column is passed through ``sz.remove_number_en``, ``sz.cut_words``
    and ``remove_stop_words`` in that order. Empty strings and the literal
    string 'None' are then turned into NaN and rows containing NaN are
    dropped.
    """
    frame = pd.read_excel(path)
    for step in (sz.remove_number_en, sz.cut_words, remove_stop_words):
        frame['text'] = frame['text'].apply(step)
    # '' is first mapped to 'None' so that both markers become NaN below.
    frame = frame.replace(to_replace='', value='None')
    return frame.replace(to_replace='None', value=np.nan).dropna()
def create_corpus(data):
    """Return the 'text' entries of *data* as lists of whitespace-split tokens."""
    corpus = []
    for sentence in data['text']:
        corpus.append(sentence.split())
    return corpus
def word_vec(corpus):
    """Build a gensim Word2Vec model from *corpus* with default settings."""
    return gensim.models.word2vec.Word2Vec(corpus)
def get_sent_vec(sent, model, size):
    """Average the word vectors of *sent* into a single (1, size) array.

    Args:
        sent: Sequence of tokens. The first element is skipped, exactly as
            in the original code (NOTE(review): confirm the skip is wanted).
        model: Object exposing ``model.wv[word]`` lookups that raise
            ``KeyError`` for unknown words.
        size: Dimensionality of the word vectors.

    Returns:
        A float array of shape (1, size) holding the mean of the vectors
        that were found, or all zeros when none of the tokens is known.
    """
    vec = np.zeros((1, size))
    count = 0
    for word in sent[1:]:
        # Only the lookup is guarded: a bare ``except`` would also hide
        # shape errors and KeyboardInterrupt.
        try:
            word_vector = model.wv[word]
        except KeyError:
            # Out-of-vocabulary token: leave it out of the average.
            continue
        vec += word_vector.reshape((1, size))
        count += 1
    if count != 0:
        vec /= count
    return vec
# Script entry point: clean the sheet, train Word2Vec, vectorise, then fit.
if __name__ == '__main__':
data = pre_process('datasets_demo.xlsx')
corpus = create_corpus(data)
model = word_vec(corpus)
# Each 'text' cell is replaced by the (1, 100) array from get_sent_vec.
# NOTE(review): the cells are still strings at this point (the token lists
# live in `corpus`), so get_sent_vec iterates over characters -- confirm.
data['text']=data['text'].apply(get_sent_vec,model=model,size=100)
# NOTE(review): train_test_split returns (X_train, X_test, y_train, y_test);
# with this unpacking order `y_train` receives the test features.
x_train,y_train,x_test,y_test = train_test_split(data['text'],data['label'])
estimator = MultinomialNB()
# NOTE(review): x_train is a Series whose cells are arrays, which matches
# "setting an array element with a sequence" in the traceback; stacking the
# cells into one 2-D array (e.g. np.vstack) is presumably needed first.
# MultinomialNB also expects non-negative features -- verify for embeddings.
estimator.fit(x_train,y_train)
Here is the full traceback:
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\12996\AppData\Local\Temp\jieba.cache
Loading model cost 0.628 seconds.
Prefix dict has been built successfully.
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-8366eff678ac>", line 1, in <module>
runfile('C:/Users/12996/Desktop/Tensorflow_/datasets_demo.py', wdir='C:/Users/12996/Desktop/Tensorflow_')
File "E:\pycharm\PyCharm 2022.1\plugins\python\helpers\pydev\_pydev_bundle\pydev_umd.py", line 198, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "E:\pycharm\PyCharm 2022.1\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/12996/Desktop/Tensorflow_/datasets_demo.py", line 66, in <module>
estimator.fit(x_train,y_train)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\naive_bayes.py", line 663, in fit
X, y = self._check_X_y(X, y)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\naive_bayes.py", line 523, in _check_X_y
return self._validate_data(X, y, accept_sparse="csr", reset=reset)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\base.py", line 581, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\utils\validation.py", line 976, in check_X_y
estimator=estimator,
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\utils\validation.py", line 746, in check_array
array = np.asarray(array, order=order, dtype=dtype)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\pandas\core\series.py", line 857, in __array__
return np.asarray(self._values, dtype)
ValueError: setting an array element with a sequence.
I am trying to predict toxic comments using Toxic Comment data from kaggle:
import skmultilearn, sys
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
# Load the training comments and build sparse TF-IDF features from them.
data_frame = pd.read_csv('data/train.csv')
corpus = data_frame['comment_text']
tfidf = TfidfVectorizer()
Xfeatures = csr_matrix(tfidf.fit_transform(corpus))
# The six label columns, converted to a sparse matrix.
y = csr_matrix(data_frame[['toxic','severe_toxic','obscene','threat','insult','identity_hate']])
# NOTE(review): with the default require_dense the wrapper calls X.toarray()
# before fitting (see the first traceback), which is the 226 GiB allocation.
binary_rel_clf = BinaryRelevance(MultinomialNB())
binary_rel_clf.fit(Xfeatures,y)
predict_text = ['fuck die shit moron suck']
# Reuse the fitted vectoriser so the prediction input has the same columns.
X_predict = tfidf.transform(predict_text)
br_prediction = binary_rel_clf.predict(X_predict)
# Turn the predicted label matrix into a dense boolean mask per sample.
br_prediction = br_prediction.toarray().astype(bool)
# NOTE(review): `y` is a csr_matrix here and has no `.columns`; the column
# names only exist on the DataFrame selection it was built from.
predictions = [y.columns.values[prediction].tolist() for prediction in br_prediction]
print(predictions)
However, I got this error:
Traceback (most recent call last):
File "...\multi_label_toxic.py", line 15, in <module>
binary_rel_clf.fit(Xfeatures,y)
File "...\problem_transform\br.py", line 161, in fit
classifier.fit(self._ensure_input_format(
File "...\base\base.py", line 86, in _ensure_input_format
return X.toarray()
File "...\scipy\sparse\compressed.py", line 1031, in toarray
out = self._process_toarray_args(order, out)
File "...\scipy\sparse\base.py", line 1202, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 226. GiB for an array with shape (159571, 189775) and data type float64
And even if try to pass the param "require_dense=False" I got another error:
Traceback (most recent call last):
File "...\multi_label_toxic.py", line 15, in <module>
binary_rel_clf.fit(Xfeatures,y)
File "...\skmultilearn\problem_transform\br.py", line 161, in fit
classifier.fit(self._ensure_input_format(
File "...\sklearn\naive_bayes.py", line 612, in fit
X, y = self._check_X_y(X, y)
File "...\sklearn\naive_bayes.py", line 477, in _check_X_y
return self._validate_data(X, y, accept_sparse='csr')
File "...\sklearn\base.py", line 433, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "...\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "...\sklearn\utils\validation.py", line 826, in check_X_y
y = column_or_1d(y, warn=True)
File "...\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "...\sklearn\utils\validation.py", line 864, in column_or_1d
raise ValueError(
ValueError: y should be a 1d array, got an array of shape () instead.
How can I fix that and train using the entire model?
It seems that you specified the require_dense argument incorrectly. You need require_dense=[False, True] in order to keep the X values in sparse format while passing the y values as dense. In the second-to-last line (predictions = ...) you need to use the label DataFrame from before it was converted to a sparse matrix, so that you can access the column names.
The following code should work.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
import numpy as np
# Load the training comments and build sparse TF-IDF features from them.
data_frame = pd.read_csv('data/train.csv')
corpus = data_frame['comment_text']
tfidf = TfidfVectorizer()
Xfeatures = csr_matrix(tfidf.fit_transform(corpus))
# Keep the label DataFrame around so its column names stay available.
cats = data_frame[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
y = csr_matrix(cats)
# require_dense = [False, True]: leave X sparse, hand y over as dense.
binary_rel_clf = BinaryRelevance(MultinomialNB(), require_dense = [False, True])
binary_rel_clf.fit(Xfeatures, y) # y[:,0].toarray().reshape(-1)
predict_text = ['fuck die shit moron suck']
# Reuse the fitted vectoriser so the prediction input has the same columns.
X_predict = tfidf.transform(predict_text)
br_prediction = binary_rel_clf.predict(X_predict)
# Turn the predicted label matrix into a dense boolean mask per sample.
br_prediction = br_prediction.toarray().astype(bool)
# Select the label names whose mask entry is True for each sample.
predictions = [cats.columns[prediction].tolist() for prediction in br_prediction]
print(predictions)
Output:
[['toxic', 'obscene', 'insult']]
I keep getting an error using the numpy loadtxt converter.
Your help is greatly appreciated
import numpy as np
import time
import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.dates as mdates
from matplotlib.finance import candlestick
from matplotlib.dates import strpdate2num
import urllib2
## global variables
# Ticker symbols; each one is expected to have a matching '<ticker>.txt' file.
eachStock = 'AAPL','GOOG','MSFT','AMZN','CMG'
for stock in eachStock:
stockFile = stock+'.txt'
# NOTE(review): loadtxt is given the tuple `eachStock`, not `stockFile`, so
# the ticker strings themselves are parsed as data lines -- this matches
# "time data 'AAPL' does not match format '%Y%m%d'" in the traceback.
# Column 0 is converted from a %Y%m%d date string to a matplotlib date number.
date, closep, highp, lowp, openp, volume = np.loadtxt(eachStock, delimiter=',', unpack=True,
converters={ 0: mdates.strpdate2num('%Y%m%d')})
# NOTE(review): `Series` is not among the imports shown above -- confirm it
# is imported elsewhere (e.g. from pandas).
dFrame = Series(closep)
here is the first line in my text file
20040322,13.5200,13.6800,12.6100,12.6850,15850720
here is the error I keep getting
Traceback (most recent call last):
File "C:\Users\antoniozeus\Desktop\BuyAndHold.py", line 27, in <module>
converters={ 0: mdates.strpdate2num('%Y%m%d')})
File "C:\Python27\lib\site-packages\numpy\lib\npyio.py", line 796, in loadtxt
items = [conv(val) for (conv, val) in zip(converters, vals)]
File "C:\Python27\lib\site-packages\matplotlib\dates.py", line 233, in __call__
return date2num(datetime.datetime(*time.strptime(s, self.fmt)[:6]))
File "C:\Python27\lib\_strptime.py", line 454, in _strptime_time
return _strptime(data_string, format)[0]
File "C:\Python27\lib\_strptime.py", line 325, in _strptime
(data_string, format))
ValueError: time data 'AAPL' does not match format '%Y%m%d'
It seems like you mistyped stockFile (filename) as eachStock.
# Pass the per-ticker file name built in the loop, not the tuple of tickers.
date, closep, highp, lowp, openp, volume = np.loadtxt(
stockFile, delimiter=',', unpack=True,
converters={ 0: mdates.strpdate2num('%Y%m%d')})
I wrote a Python script (below) which loads data from a text file (using pandas) and checks the values in the columns.
import sys
import pandas as pd
import numpy as np
from numpy import ndarray
import math
import matplotlib.pyplot as plt
from matplotlib.pyplot import *
from skimage import data
from skimage.feature import match_template
# Script entry point (Python 2: note the print statement at the end).
if __name__ == '__main__':
# NOTE(review): this rebinds `data`, shadowing the `data` module imported
# from skimage at the top of the script.
data = pd.read_csv('Fe_PSI_spt_refined.txt', sep=" ", header = None)
data.columns = ["Angle_number", "Omega", "Intensity", "X", "Y", "Address", "ID"]#, "flag"]
Number_of_projections = 181
Number_of_lines_in_txt = 3493
# Actual number of rows read from the file; not used by the loops below.
numrows = len(data)
counter_array = []
correlation_threshold_value = 0.7
a = np.zeros(Number_of_lines_in_txt)
output_file = ("output.txt")
# Projections are numbered 2..Number_of_projections inclusive.
for i in range(2, (Number_of_projections + 1)):
# File names carry the projection number zero-padded to three digits.
filename_cutouts_combined = ("cutouts_combined_%03i.txt" % (i))
filename_cutouts_combined_tag = ("cutouts_combined_tag_%03i.txt" % (i))
image = np.loadtxt(filename_cutouts_combined)
image_tagged = np.loadtxt(filename_cutouts_combined_tag)
# NOTE(review): j runs up to 3491 based on the hard-coded line count; the
# KeyError: 3491 in the traceback suggests the DataFrame has fewer rows.
# Looping over range(numrows) would presumably avoid it -- confirm.
for j in range(0, Number_of_lines_in_txt - 1):
print data.Angle_number[j], i
After one iteration of j I get the error below. Do you spot any error I should fix? Thanks
`Traceback (most recent call last):
File "Hyperbola_search.py", line 46, in <module>
print data.Angle_number[j], i
File "/Users/Alberto/anaconda/lib/python2.7/site-packages/pandas/core/series.py", line 491, in __getitem__
result = self.index.get_value(self, key)
File "/Users/Alberto/anaconda/lib/python2.7/site-packages/pandas/core/index.py", line 1032, in get_value
return self._engine.get_value(s, k)
File "index.pyx", line 97, in pandas.index.IndexEngine.get_value (pandas/index.c:2661)
File "index.pyx", line 105, in pandas.index.IndexEngine.get_value (pandas/index.c:2476)
File "index.pyx", line 149, in pandas.index.IndexEngine.get_loc (pandas/index.c:3215)
File "hashtable.pyx", line 382, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:6450)
File "hashtable.pyx", line 388, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:6394)
KeyError: 3491`
You load files into image and image_tagged, while a remains unused.
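data is the DataFrame returned by pd.read_csv (it shadows the data module imported from skimage), and numrows is its row count. data.Angle_number[j] looks up the row labelled j, so KeyError: 3491 means there is no row with that label: the file presumably has fewer rows than the hard-coded Number_of_lines_in_txt. Looping over range(numrows) instead should avoid the error.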