ValueError: Object arrays cannot be loaded when allow_pickle=False - python

I am trying to run the following code and am hoping for a solution:
much_data = np.load('muchdata-50-50-20.npy')
output:
ValueError Traceback (most recent call last)
<ipython-input-6-6710fe7f2bb7> in <module>
----> 1 much_data = np.load('muchdata-50-50-20.npy')
~\anaconda3\envs\tf-gpu-cuda8\lib\site-packages\numpy\lib\npyio.py in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
437 return format.open_memmap(file, mode=mmap_mode)
438 else:
--> 439 return format.read_array(fid, allow_pickle=allow_pickle,
440 pickle_kwargs=pickle_kwargs)
441 else:
~\anaconda3\envs\tf-gpu-cuda8\lib\site-packages\numpy\lib\format.py in read_array(fp, allow_pickle, pickle_kwargs)
725 # The array contained Python objects. We need to unpickle the data.
726 if not allow_pickle:
--> 727 raise ValueError("Object arrays cannot be loaded when "
728 "allow_pickle=False")
729 if pickle_kwargs is None:
ValueError: Object arrays cannot be loaded when allow_pickle=False
Please let me know how to fix this.

Try
much_data = np.load('muchdata-50-50-20.npy', allow_pickle=True)
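For context: NumPy 1.16.3 changed the default of allow_pickle to False as a security precaution, because unpickling an untrusted file can execute arbitrary code, and object arrays need pickle to deserialize, hence the error. A minimal sketch of the fix, assuming the file comes from a source you trust:
import numpy as np

# The array holds Python objects, so pickle is required to deserialize it.
# Only opt in for files you created yourself or otherwise trust.
much_data = np.load('muchdata-50-50-20.npy', allow_pickle=True)
print(much_data.shape, much_data.dtype)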

Related

AttributeError: 'IntervalArray' object has no attribute '_dtype' while reading pickle file to dataframe

I want to read a pickle file into a data frame. However, I get the following error message and I don't know how to solve the issue.
df_batches = pd.read_pickle(folder_in / "batches_and_phases.p")
Error Message:
AttributeError Traceback (most recent call last)
File ~\Master_Thesis\mypython\lib\site-packages\pandas\io\pickle.py:205, in read_pickle(filepath_or_buffer, compression, storage_options)
204 warnings.simplefilter("ignore", Warning)
--> 205 return pickle.load(handles.handle)
206 except excs_to_catch:
207 # e.g.
208 # "No module named 'pandas.core.sparse.series'"
209 # "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
File ~\Master_Thesis\mypython\lib\site-packages\pandas\_libs\internals.pyx:750, in pandas._libs.internals.BlockManager.__setstate__()
File ~\Master_Thesis\mypython\lib\site-packages\pandas\_libs\internals.pyx:767, in pandas._libs.internals.BlockManager.__setstate__()
File ~\Master_Thesis\mypython\lib\site-packages\pandas\core\internals\blocks.py:2143, in ensure_block_shape(values, ndim)
2142 if values.ndim < ndim:
-> 2143 if not is_1d_only_ea_dtype(values.dtype):
2144 # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023
2145 # block.shape is incorrect for "2D" ExtensionArrays
2146 # We can't, and don't need to, reshape.
2147 values = cast("np.ndarray | DatetimeArray | TimedeltaArray", values)
File ~\Master_Thesis\mypython\lib\site-packages\pandas\core\arrays\interval.py:624, in IntervalArray.dtype(self)
622 @property
623 def dtype(self) -> IntervalDtype:
--> 624 return self._dtype
AttributeError: 'IntervalArray' object has no attribute '_dtype'
Thank you!
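A likely cause, though it is an assumption here: the pickle was written by a different pandas version than the one reading it, and internal attributes of IntervalArray (such as _dtype) changed between releases, so the unpickled object is missing fields the reading version expects. The usual way out is to read the file with a pandas version matching the one that wrote it, then re-save in a version-stable format. A sketch under that assumption:
import pandas as pd

# Run this in an environment whose pandas matches the version that wrote the pickle.
print(pd.__version__)
df_batches = pd.read_pickle("batches_and_phases.p")

# CSV flattens extension dtypes such as Interval, but it round-trips across versions.
df_batches.to_csv("batches_and_phases.csv", index=False)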

astype(str) in pandas raise TypeError

I have a simple DataFrame that produced an error.
stdcos.head() produces:
   PartNo  Cost
0     180  8.95
1     213  0.32
2     215  2.77
3     216  3.02
4     218  1.37
stdcos.dtypes returns
PartNo     object
Cost      float64
dtype: object
Why does stdcos['PartNo'].astype(str) raise a TypeError in this case?
Could it have something to do with the weird PartNo dtype?
Sorry, this is the error message
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-48-791196d10a7a> in <module>
----> 1 stdcos['PartNo'].astype(str)
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2774 if self.columns.nlevels > 1:
2775 return self._getitem_multilevel(key)
-> 2776 return self._get_item_cache(key)
2777
2778 # Do we have a slicer (on rows)?
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
3584 res = cache.get(item)
3585 if res is None:
-> 3586 values = self._data.get(item)
3587 res = self._box_item_values(item, values)
3588 cache[item] = res
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\internals\managers.py in get(self, item)
966 raise ValueError("cannot label index with a null key")
967
--> 968 return self.iget(loc)
969 else:
970
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\internals\managers.py in iget(self, i)
983 Otherwise return as a ndarray
984 """
--> 985 block = self.blocks[self._blknos[i]]
986 values = block.iget(self._blklocs[i])
987
TypeError: only integer scalar arrays can be converted to a scalar index
Thanks to @Juanpa.arrivillaga,
I found the bug.
It comes from stdcos.columns = [['PartNo', 'Cost']]
So the column index was actually a nested MultiIndex, but for some reason it displayed like flat names in the Jupyter notebook.
Changing it back to stdcos.columns = ['PartNo', 'Cost'] fixed the issue.
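A minimal sketch reproducing the diagnosis, with stand-in data since the real stdcos is not shown:
import pandas as pd

# Stand-in data; the real stdcos comes from elsewhere.
stdcos = pd.DataFrame({'PartNo': ['180', '213'], 'Cost': [8.95, 0.32]})

# The bug: assigning a nested list turns the columns into a MultiIndex.
stdcos.columns = [['PartNo', 'Cost']]
print(type(stdcos.columns))  # <class 'pandas.core.indexes.multi.MultiIndex'>

# The fix: a flat list restores ordinary string labels.
stdcos.columns = ['PartNo', 'Cost']
print(stdcos['PartNo'].astype(str))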

appending json files in python

I am trying to append several JSON files in Python. The code below seems right to me, but I am getting an error.
The code is as follows.
import pandas as pd

df1 = pd.DataFrame()
for i in range(0, 49):
    df = pd.read_json('/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
    df1.append(df.T)
The error is as follows.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-76-ddb355627155> in <module>
3 df1=pd.DataFrame()
4 for i in range(0,49):
----> 5 df = pd.read_json ('/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
6 df1.append(df.T)
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression)
590 return json_reader
591
--> 592 result = json_reader.read()
593 if should_close:
594 try:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in read(self)
715 obj = self._get_object_parser(self._combine_lines(data.split("\n")))
716 else:
--> 717 obj = self._get_object_parser(self.data)
718 self.close()
719 return obj
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in _get_object_parser(self, json)
737 obj = None
738 if typ == "frame":
--> 739 obj = FrameParser(json, **kwargs).parse()
740
741 if typ == "series" or obj is None:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in parse(self)
847
848 else:
--> 849 self._parse_no_numpy()
850
851 if self.obj is None:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in _parse_no_numpy(self)
1091 if orient == "columns":
1092 self.obj = DataFrame(
-> 1093 loads(json, precise_float=self.precise_float), dtype=None
1094 )
1095 elif orient == "split":
ValueError: Expected object or value
The code works when I read each file individually. Would anyone be able to help me with this?
Thanks & Best Regards
Michael
The error occurs on the df = pd.read_json(...) line. It is likely that one of the files is missing or malformed. My advice is to use a try/except to identify it:
for i in range(0, 49):
    path = '/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i)
    try:
        df = pd.read_json(path)
    except:
        print('Error on iteration', i, ', file', path)
        raise
    df1 = df1.append(df.T)  # append returns a new frame; reassign to keep the rows
Catching any exception is normally bad practice because it can hide truly abnormal conditions like an IO or memory error. That is why I re-raise the original exception in the code above.
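As a side note, DataFrame.append returns a new object rather than extending in place, and it is deprecated in recent pandas (removed in 2.0). Collecting the frames and concatenating once is the usual pattern; a sketch assuming the same file layout:
import pandas as pd

frames = []
for i in range(0, 49):
    path = '/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i)
    frames.append(pd.read_json(path).T)  # transpose each file as in the original loop
df1 = pd.concat(frames)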

Word2Vec error when loading in GoogleNews data

I am following a tutorial here: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
I am at the part "Word2vec and Logistic Regression". I have downloaded the "GoogleNews-vectors-negative300.bin.gz" file and I am trying to apply it to my own text data. However, when I get to the following code:
%%time
import gensim  # needed: the next line uses gensim.models.KeyedVectors
from gensim.models import Word2Vec

wv = gensim.models.KeyedVectors.load_word2vec_format(
    "/data/users/USERS/File_path/classifier/GoogleNews_Embedding/GoogleNews-vectors-negative300.bin.gz",
    binary=True)
wv.init_sims(replace=True)
I run into the following error:
/data/users/msmith/env/lib64/python3.6/site-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function
'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
---------------------------------------------------------------------------
EOFError Traceback (most recent call last)
<timed exec> in <module>
~/env/lib64/python3.6/site-packages/gensim/models/keyedvectors.py in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
1492 return _load_word2vec_format(
1493 cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
-> 1494 limit=limit, datatype=datatype)
1495
1496 def get_keras_embedding(self, train_embeddings=False):
~/env/lib64/python3.6/site-packages/gensim/models/utils_any2vec.py in _load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
383 with utils.ignore_deprecation_warning():
384 # TODO use frombuffer or something similar
--> 385 weights = fromstring(fin.read(binary_len), dtype=REAL).astype(datatype)
386 add_word(word, weights)
387 else:
/usr/lib64/python3.6/gzip.py in read(self, size)
274 import errno
275 raise OSError(errno.EBADF, "read() on write-only GzipFile object")
--> 276 return self._buffer.read(size)
277
278 def read1(self, size=-1):
/usr/lib64/python3.6/_compression.py in readinto(self, b)
66 def readinto(self, b):
67 with memoryview(b) as view, view.cast("B") as byte_view:
---> 68 data = self.read(len(byte_view))
69 byte_view[:len(data)] = data
70 return len(data)
/usr/lib64/python3.6/gzip.py in read(self, size)
480 break
481 if buf == b"":
--> 482 raise EOFError("Compressed file ended before the "
483 "end-of-stream marker was reached")
484
EOFError: Compressed file ended before the end-of-stream marker was reached
Any idea what's gone wrong / how to overcome this issue?
Thanks in advance!
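An EOFError of "Compressed file ended before the end-of-stream marker was reached" usually means the .gz archive is truncated, most often from an interrupted download. A quick check under that assumption (the full GoogleNews archive is roughly 1.6 GB; the exact size may vary by mirror):
import gzip
import os

path = "/data/users/USERS/File_path/classifier/GoogleNews_Embedding/GoogleNews-vectors-negative300.bin.gz"
print(os.path.getsize(path))  # far below ~1.6 GB suggests an incomplete download

# Reading to the end reproduces the truncation error without loading the model:
with gzip.open(path, 'rb') as f:
    while f.read(1 << 20):
        pass
If the read fails partway through, re-download the archive and compare file sizes before loading it with gensim again.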

Joblib error: TypeError: can't pickle _thread.lock objects

I am unable to run joblib with my function, which takes a numpy array, a list of trained Keras models, and a list of strings as parameters.
I tried creating the parameters as a namedtuple or even as a class with immutable properties. Any ideas?
Params = collections.namedtuple('Params', ['inputs', 'y_list', 'trained_models'])
p = Params(inputs, y_list, trained_models)
or
class Params:
    def __init__(self, inputs, y_list, trained_models):
        super(Params, self).__setattr__("inputs", inputs)
        super(Params, self).__setattr__("y_list", y_list)
        super(Params, self).__setattr__("trained_models", trained_models)
The function I would like to run in parallel:
def predict(params):
    inputs = params.inputs
    y_list = params.y_list
    trained_models = params.trained_models
    # process and vectorize inputs
    X = new_X(inputs)
    X_vect = vect.transform(X)
    predictions = dict()
    for y in y_list:
        y_field = trained_models[y].predict(X_vect)
        # evaluate model
        if y_field[0] > 0.05:
            return None, None
        predictions[y] = y_field[0]
    return X, predictions
Parallelized call of the function:
r = Parallel(n_jobs=4, verbose=5)(
    delayed(predict)(p)
    for c in range(100))
Error:
TypeError Traceback (most recent call last)
<timed exec> in <module>()
~/.conda/envs/mlgpu/lib/python3.6/site-packages/joblib/parallel.py in
__call__(self, iterable)
787 # consumption.
788 self._iterating = False
--> 789 self.retrieve()
790 # Make sure that we get a last message telling us we are done
791 elapsed_time = time.time() - self._start_time
~/.conda/envs/mlgpu/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
697 try:
698 if getattr(self._backend, 'supports_timeout', False):
--> 699 self._output.extend(job.get(timeout=self.timeout))
700 else:
701 self._output.extend(job.get())
~/.conda/envs/mlgpu/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
~/.conda/envs/mlgpu/lib/python3.6/multiprocessing/pool.py in
_handle_tasks(taskqueue, put, outqueue, pool, cache)
422 break
423 try:
--> 424 put(task)
425 except Exception as e:
426 job, idx = task[:2]
~/.conda/envs/mlgpu/lib/python3.6/site-packages/joblib/pool.py in send(obj)
369 def send(obj):
370 buffer = BytesIO()
--> 371 CustomizablePickler(buffer, self._reducers).dump(obj)
372 self._writer.send_bytes(buffer.getvalue())
373 self._send = send
TypeError: can't pickle _thread.lock objects
You should create your own class, because you don't know whether collections.namedtuple has non-picklable parts.
I ran into a similar problem some months ago when I was adding lambda functions to a class and passing it as an argument. Since lambda functions are not picklable (by the pickle module), it raised an error.
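That said, the likeliest unpicklable objects here are the trained Keras models themselves: they hold references to thread locks and sessions, which is exactly what the "_thread.lock" message points at. A common workaround, sketched under the assumption that the models can be saved to disk first (predict_from_disk and model_paths are illustrative names, not from the original post):
from joblib import Parallel, delayed
from keras.models import load_model

def predict_from_disk(inputs, y_list, model_paths):
    # Load inside the worker so only picklable strings cross the process boundary.
    trained_models = {y: load_model(model_paths[y]) for y in y_list}
    return {y: trained_models[y].predict(inputs) for y in y_list}

r = Parallel(n_jobs=4, verbose=5)(
    delayed(predict_from_disk)(inputs, y_list, model_paths)
    for _ in range(100))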
