I'm using Python xarray's open_mfdataset to open a dataset that's spread across multiple HDF5 files. I have 2 datasets that should be nearly identical in structure. The first dataset works just fine, but when I try to open the other, I get the error:
ValueError: axes don't match array
I have no idea what is causing the error, and every Google search turns up questions about neural networks, which seem to trip the same error but for a different reason.
Edit: I should share that the line the error occurs on looks like this:
df = xr.open_mfdataset("/some/directory/*.h5", concat_dim='TIME')
I'd share the files, but they're quite large, I'm not sure of the legality of doing so, and I'm not sure how to reproduce the problem without them.
Edit 2:
I think I found the problem. The data's (x, y, z) dimensions are (300, 300, 60). However, the model I'm using reuses the same named dimension for x and y, since nx and ny are the same size (this is very silly IMO). So the header of the file states:
-> % ncdump -h icefix-A-2014-02-05-120000-g1.h5
netcdf icefix-A-2014-02-05-120000-g1 {
dimensions:
phony_dim_0 = 300 ;
phony_dim_1 = 60 ;
phony_dim_2 = 2 ;
phony_dim_3 = 5 ;
phony_dim_4 = 11 ;
And a 3D variable looks something like
float CCP(phony_dim_1, phony_dim_0, phony_dim_0) ;
And I think xarray is having problems dealing with a variable that has duplicate dimension names. I'm not sure if there's a way to fix this other than altering the way the model outputs its files.
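A possible workaround (untested; the CCP variable and the (z, y, x) axis order are assumptions based on the header above) is to bypass xarray's dimension inference entirely: read each file with h5py, assign distinct dimension names by hand, and concatenate the per-file datasets along a new TIME dimension:
import glob
import h5py
import xarray as xr

def load_file(path):
    # Read the raw array and give every axis its own name so that no
    # dimension name is repeated within a single variable.
    with h5py.File(path, "r") as f:
        ccp = f["CCP"][:]  # assumed shape (60, 300, 300) = (z, y, x)
    return xr.Dataset({"CCP": (("z", "y", "x"), ccp)})

paths = sorted(glob.glob("/some/directory/*.h5"))
combined = xr.concat([load_file(p) for p in paths], dim="TIME")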
The full traceback is as follows:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-cb4a12bd492e> in <module>
----> 1 control_d1 = xr.open_mfdataset(datadir + "feb2014_control/icefix*g1.h5", concat_dim='TIME')
~/anaconda3/lib/python3.7/site-packages/xarray/backends/api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, lock, data_vars, coords, autoclose, parallel, **kwargs)
717 data_vars=data_vars, coords=coords,
718 infer_order_from_coords=infer_order_from_coords,
--> 719 ids=ids)
720 except ValueError:
721 for ds in datasets:
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _auto_combine(datasets, concat_dims, compat, data_vars, coords, infer_order_from_coords, ids)
551 # Repeatedly concatenate then merge along each dimension
552 combined = _combine_nd(combined_ids, concat_dims, compat=compat,
--> 553 data_vars=data_vars, coords=coords)
554 return combined
555
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _combine_nd(combined_ids, concat_dims, data_vars, coords, compat)
473 data_vars=data_vars,
474 coords=coords,
--> 475 compat=compat)
476 combined_ds = list(combined_ids.values())[0]
477 return combined_ds
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat)
491 datasets = combined_ids.values()
492 new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat,
--> 493 data_vars, coords)
494 return new_combined_ids
495
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _auto_combine_1d(datasets, concat_dim, compat, data_vars, coords)
509 concatenated = [_auto_concat(list(ds_group), dim=dim,
510 data_vars=data_vars, coords=coords)
--> 511 for id, ds_group in grouped_by_vars]
512 else:
513 concatenated = datasets
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in <listcomp>(.0)
509 concatenated = [_auto_concat(list(ds_group), dim=dim,
510 data_vars=data_vars, coords=coords)
--> 511 for id, ds_group in grouped_by_vars]
512 else:
513 concatenated = datasets
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _auto_concat(datasets, dim, data_vars, coords)
367 'explicitly')
368 dim, = concat_dims
--> 369 return concat(datasets, dim=dim, data_vars=data_vars, coords=coords)
370
371
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
118 raise TypeError('can only concatenate xarray Dataset and DataArray '
119 'objects, got %s' % type(first_obj))
--> 120 return f(objs, dim, data_vars, coords, compat, positions)
121
122
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
303 if k in concat_over:
304 vars = ensure_common_dims([ds.variables[k] for ds in datasets])
--> 305 combined = concat_vars(vars, dim, positions)
306 insert_result_variable(k, combined)
307
~/anaconda3/lib/python3.7/site-packages/xarray/core/variable.py in concat(variables, dim, positions, shortcut)
2083 along the given dimension.
2084 """
-> 2085 variables = list(variables)
2086 if all(isinstance(v, IndexVariable) for v in variables):
2087 return IndexVariable.concat(variables, dim, positions, shortcut)
~/anaconda3/lib/python3.7/site-packages/xarray/core/combine.py in ensure_common_dims(vars)
296 common_shape = tuple(non_concat_dims.get(d, dim_len)
297 for d in common_dims)
--> 298 var = var.set_dims(common_dims, common_shape)
299 yield var
300
~/anaconda3/lib/python3.7/site-packages/xarray/core/variable.py in set_dims(self, dims, shape)
1209 expanded_var = Variable(expanded_dims, expanded_data, self._attrs,
1210 self._encoding, fastpath=True)
-> 1211 return expanded_var.transpose(*dims)
1212
1213 def _stack_once(self, dims, new_dim):
~/anaconda3/lib/python3.7/site-packages/xarray/core/variable.py in transpose(self, *dims)
1152 return self.copy(deep=False)
1153
-> 1154 data = as_indexable(self._data).transpose(axes)
1155 return type(self)(dims, data, self._attrs, self._encoding,
1156 fastpath=True)
~/anaconda3/lib/python3.7/site-packages/xarray/core/indexing.py in transpose(self, order)
1210
1211 def transpose(self, order):
-> 1212 return self.array.transpose(order)
1213
1214
~/anaconda3/lib/python3.7/site-packages/dask/array/core.py in transpose(self, *axes)
1331 See Also
1332 --------
-> 1333 da.store
1334 h5py.File.create_dataset
1335 """
~/anaconda3/lib/python3.7/site-packages/dask/array/routines.py in transpose(a, axes)
136
137
--> 138 @derived_from(np)
139 def swapaxes(a, axis1, axis2):
140 if axis1 == axis2:
ValueError: axes don't match array
I am trying to calculate sine and cosine of month number (e.g. Jan=1, Feb=2, ... Dec=12) for a series of observations that covers ~5 years:
def get_sin(value, max_value):
    sine = np.sin(value * (2. * np.pi / max_value))
    return sine

def get_cosine(value, max_value):
    cosine = np.cos(value * (2. * np.pi / max_value))
    return cosine
I run the following command on the data:
df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(month, 12), axis=1)
However, my desktop's RAM is exhausted, and then I get the following MemoryError:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
Input In [466], in <cell line: 1>()
----> 1 df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(month, 12), axis=1)
File ~\Anaconda3\lib\site-packages\pandas\core\frame.py:8839, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8828 from pandas.core.apply import frame_apply
8830 op = frame_apply(
8831 self,
8832 func=func,
(...)
8837 kwargs=kwargs,
8838 )
-> 8839 return op.apply().__finalize__(self, method="apply")
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
724 elif self.raw:
725 return self.apply_raw()
--> 727 return self.apply_standard()
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:854, in FrameApply.apply_standard(self)
851 results, res_index = self.apply_series_generator()
853 # wrap results
--> 854 return self.wrap_results(results, res_index)
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:880, in FrameApply.wrap_results(self, results, res_index)
878 # see if we can infer the results
879 if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 880 return self.wrap_results_for_axis(results, res_index)
882 # dict of scalars
883
884 # the default dtype of an empty Series will be `object`, but this
885 # code can be hit by df.mean() where the result should have dtype
886 # float64 even if it's an empty Series.
887 constructor_sliced = self.obj._constructor_sliced
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:1027, in FrameColumnApply.wrap_results_for_axis(self, results, res_index)
1023 result.index = res_index
1025 # we may want to infer results
1026 else:
-> 1027 result = self.infer_to_same_shape(results, res_index)
1029 return result
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:1033, in FrameColumnApply.infer_to_same_shape(self, results, res_index)
1031 def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
1032 """infer the results to the same shape as the input object"""
-> 1033 result = self.obj._constructor(data=results)
1034 result = result.T
1036 # set the index
File ~\Anaconda3\lib\site-packages\pandas\core\frame.py:636, in DataFrame.__init__(self, data, index, columns, dtype, copy)
630 mgr = self._init_mgr(
631 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
632 )
634 elif isinstance(data, dict):
635 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 636 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
637 elif isinstance(data, ma.MaskedArray):
638 import numpy.ma.mrecords as mrecords
File ~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py:494, in dict_to_mgr(data, index, columns, dtype, typ, copy)
487 arrays = [
488 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
489 ]
491 if copy:
492 # arrays_to_mgr (via form_blocks) won't make copies for EAs
493 # dtype attr check to exclude EADtype-castable strs
--> 494 arrays = [
495 x
496 if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
497 else x.copy()
498 for x in arrays
499 ]
500 # TODO: can we get rid of the dt64tz special case above?
502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
File ~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py:497, in <listcomp>(.0)
487 arrays = [
488 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
489 ]
491 if copy:
492 # arrays_to_mgr (via form_blocks) won't make copies for EAs
493 # dtype attr check to exclude EADtype-castable strs
494 arrays = [
495 x
496 if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
--> 497 else x.copy()
498 for x in arrays
499 ]
500 # TODO: can we get rid of the dt64tz special case above?
502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
File ~\Anaconda3\lib\site-packages\pandas\core\generic.py:6032, in NDFrame.copy(self, deep)
5926 @final
5927 def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
5928 """
5929 Make a copy of this object's indices and data.
5930
(...)
6030 dtype: object
6031 """
-> 6032 data = self._mgr.copy(deep=deep)
6033 self._clear_item_cache()
6034 return self._constructor(data).__finalize__(self, method="copy")
File ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py:603, in BaseBlockManager.copy(self, deep)
600 else:
601 new_axes = list(self.axes)
--> 603 res = self.apply("copy", deep=deep)
605 res.axes = new_axes
607 if self.ndim > 1:
608 # Avoid needing to re-compute these
File ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py:304, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
302 applied = b.apply(f, **kwargs)
303 else:
--> 304 applied = getattr(b, f)(**kwargs)
305 except (TypeError, NotImplementedError):
306 if not ignore_failures:
File ~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py:643, in Block.copy(self, deep)
641 values = self.values
642 if deep:
--> 643 values = values.copy()
644 return type(self)(values, placement=self._mgr_locs, ndim=self.ndim)
File ~\Anaconda3\lib\site-packages\pandas\core\arrays\masked.py:680, in BaseMaskedArray.copy(self)
678 def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
679 data, mask = self._data, self._mask
--> 680 data = data.copy()
681 mask = mask.copy()
682 return type(self)(data, mask, copy=False)
MemoryError: Unable to allocate 404. KiB for an array with shape (51724,) and data type float64
I suppose there is something very inefficient in my code. Can anybody suggest what I am doing wrong?
UPDATE:
I noticed something very weird about the variable 'month'. I used
df_ufvdate['month'] = df_ufvdate['month'].astype('int64')
to convert 'month' into an integer and when I run df_ufvdate.info(max_cols=250, show_counts='True') I see that 'month' is type 'int64':
month 51724 non-null int64
However, when I run
df_ufvdate['month'].describe()
I get that 'month' is type 'float64':
count 51724.000000
mean 8.030895
std 3.693370
min 1.000000
25% 5.000000
50% 9.000000
75% 11.000000
max 12.000000
Name: month, dtype: float64
Here is more info on df_ufvdate:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51724 entries, 1 to 62618
Data columns (total 211 columns)
dtypes: Int64(34), float64(105), int64(1), object(71)
memory usage: 85.3+ MB
Here is my desktop specs:
Windows 64,
RAM: 24GB,
Jupyter: 6.4.8,
Python 3.9.12 (main, Apr 4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
I got it fixed:
def get_sin(row, column, max_value):
    value = row[column]
    sine = np.sin(value * (2. * np.pi / max_value))
    return sine

def get_cosine(row, column, max_value):
    value = row[column]
    cosine = np.cos(value * (2. * np.pi / max_value))
    return cosine
and then these lambdas will do the trick:
df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(row, 'month', 12), axis=1)
df_ufvdate['month_cosine'] = df_ufvdate.apply(lambda row: get_cosine(row, 'month', 12), axis=1)
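As a side note, np.sin and np.cos are vectorized, so the same columns can be computed without apply at all, avoiding the per-row Python overhead entirely (a sketch, assuming 'month' is numeric as above):
df_ufvdate['month_sine'] = np.sin(df_ufvdate['month'] * (2. * np.pi / 12))
df_ufvdate['month_cosine'] = np.cos(df_ufvdate['month'] * (2. * np.pi / 12))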
Thanks to all who commented on this question!
I am trying to get a TensorFlow TFX pipeline up and running using the MNIST dataset.
# Imports
import pandas as pd
import numpy as np
from keras.datasets import mnist
import tensorflow as tf
from tfx import v1 as tfx
import os
from tfx.components import ImportExampleGen
from platform import python_version
python_version() #'3.8.8'
# Load the data - 60,000 training examples and 10,000 testing examples
(train_x, train_y), (test_x, test_y) = mnist.load_data()
Set up the pipeline paths:
_pipeline_root = './pipeline'
_data_root = './data'
if not os.path.isdir(_pipeline_root) and not os.path.isdir(_data_root):
    !mkdir {_pipeline_root}
    !mkdir {_data_root}
Write the data to TFRecord format and save it in the eval and train dirs. Note that each MNIST image starts as a 28x28 NumPy array and is converted to a bytestring so it can be encoded as part of the TFRecord.
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):  # if value is a tensor
        value = value.numpy()  # get the value of the tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
    array = tf.io.serialize_tensor(array)
    return array
def image_label_to_tf_train(image, label):
    image_shape = np.shape(image)
    # define the dictionary -- the structure -- of our single example
    data = {
        'height': _int64_feature(image_shape[0]),
        'width': _int64_feature(image_shape[1]),
        'raw_image': _bytes_feature(serialize_array(image)),
        'label': _int64_feature(label)
    }
    # create an Example, wrapping the single features
    return tf.train.Example(features=tf.train.Features(feature=data))
def write_images_to_tfr_short(images, labels, filename: str = "images", folder=""):
    if not os.path.isdir(folder):
        !mkdir {folder}
    filename = folder + "/" + filename + ".tfrecords"
    writer = tf.io.TFRecordWriter(filename)  # create a writer that'll store our data to disk
    count = 0
    for index in range(len(images)):
        # get the data we want to write
        current_image = images[index]
        current_label = labels[index]
        out = image_label_to_tf_train(image=current_image, label=current_label)
        writer.write(out.SerializeToString())
        count += 1
    writer.close()
    print(f"Wrote {count} elements to TFRecord")
    return count
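For reference, a hypothetical invocation matching the description above (the split names and folder layout are assumptions, not from the original post):
write_images_to_tfr_short(train_x, train_y, filename="train", folder=_data_root + "/train")
write_images_to_tfr_short(test_x, test_y, filename="eval", folder=_data_root + "/eval")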
The next stage is to call the Transform component, which uses the preprocessing_fn. This function should process all the data; for example, dividing the image array by 255 is a standard feature-scaling step. But the image is still a bytestring, and I can't for the life of me figure out how to turn it back into an array. The below is what I have tried.
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    # Initialize outputs dictionary
    outputs = {}
    raw_image_dataset = inputs[_IMAGE_KEY]
    img = tf.io.decode_raw(raw_image_dataset, tf.int64)
    outputs[_IMAGE_KEY] = img
    outputs[_LABEL_KEY] = tf.cast(inputs[_LABEL_KEY], tf.int64)
    return outputs
I get the following error:
WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Tuple[Dict[str, Union[NoneType, _Dataset]], Union[Dict[str, Dict[str, PCollection]], NoneType], int] instead.
WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Tuple[Dict[str, Union[NoneType, _Dataset]], Union[Dict[str, Dict[str, PCollection]], NoneType], int] instead.
WARNING:root:Make sure that locally built Python SDK docker image has Python 3.8 interpreter.
INFO:tensorflow:Assets written to: ./pipeline/Transform/transform_graph/225/.temp_path/tftransform_tmp/26150ae80de847fab932efeb0f0c610f/assets
INFO:tensorflow:Assets written to: ./pipeline/Transform/transform_graph/225/.temp_path/tftransform_tmp/26150ae80de847fab932efeb0f0c610f/assets
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.PerWindowInvoker.invoke_process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window()
/opt/conda/lib/python3.8/site-packages/apache_beam/transforms/core.py in <lambda>(x, *args, **kwargs)
1636 if fn_takes_side_inputs(fn):
-> 1637 wrapper = lambda x, *args, **kwargs: [fn(x, *args, **kwargs)]
1638 else:
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/beam/impl.py in _create_v2_saved_model(tensor_replacement_map, base_temp_dir, preprocessing_fn, input_signature, baseline_analyzers_fingerprint, output_keys_to_name_map)
662 saved_model_dir = beam_common.get_unique_temp_path(base_temp_dir)
--> 663 impl_helper.trace_and_write_v2_saved_model(saved_model_dir, preprocessing_fn,
664 input_signature, base_temp_dir,
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/impl_helper.py in trace_and_write_v2_saved_model(saved_model_dir, preprocessing_fn, input_signature, base_temp_dir, baseline_analyzers_fingerprint, tensor_replacement_map, output_keys_to_name_map)
893 analyzer_nodes.TENSOR_REPLACEMENTS):
--> 894 metadata = _trace_and_get_metadata(concrete_transform_fn, structured_inputs,
895 preprocessing_fn, base_temp_dir,
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/impl_helper.py in _trace_and_get_metadata(concrete_transform_fn, structured_inputs, preprocessing_fn, base_temp_dir, tensor_replacement_map)
805 return dataset_metadata.DatasetMetadata(
--> 806 schema=schema_inference.infer_feature_schema_v2(
807 concrete_transform_fn.structured_outputs,
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/schema_inference.py in infer_feature_schema_v2(features, concrete_metadata_fn, evaluate_schema_overrides)
255 metadata)
--> 256 return _infer_feature_schema_common(
257 features,
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/schema_inference.py in _infer_feature_schema_common(features, tensor_ranges, feature_annotations, global_annotations, is_evaluation_complete)
300 min=min_value, max=max_value, is_categorical=True)
--> 301 feature_spec = _feature_spec_from_batched_tensors(features,
302 is_evaluation_complete)
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/schema_inference.py in _feature_spec_from_batched_tensors(tensors, is_evaluation_complete)
128 dim is None for dim in shape.as_list()[1:]):
--> 129 raise ValueError(
130 'Feature {} ({}) had invalid shape {} for FixedLenFeature: apart '
ValueError: Feature raw_image (Tensor("Identity_1:0", shape=(None, 1, None), dtype=int64)) had invalid shape (None, 1, None) for FixedLenFeature: apart from the batch dimension, all dimensions must have known size
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-37-7beafa4fe436> in <module>
3 schema=schema_gen.outputs['schema'],
4 module_file=os.path.abspath(_mnist_transform_module))
----> 5 context.run(transform, enable_cache=False)
/opt/conda/lib/python3.8/site-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run_if_ipython(*args, **kwargs)
61 # __IPYTHON__ variable is set by IPython, see
62 # https://ipython.org/ipython-doc/rel-0.10.2/html/interactive/reference.html#embedding-ipython.
---> 63 return fn(*args, **kwargs)
64 else:
65 absl.logging.warning(
/opt/conda/lib/python3.8/site-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run(self, component, enable_cache, beam_pipeline_args)
181 telemetry_utils.LABEL_TFX_RUNNER: runner_label,
182 }):
--> 183 execution_id = launcher.launch().execution_id
184
185 return execution_result.ExecutionResult(
/opt/conda/lib/python3.8/site-packages/tfx/orchestration/launcher/base_component_launcher.py in launch(self)
198 # be immutable in this context.
199 # output_dict can still be changed, specifically properties.
--> 200 self._run_executor(execution_decision.execution_id,
201 copy.deepcopy(execution_decision.input_dict),
202 execution_decision.output_dict,
/opt/conda/lib/python3.8/site-packages/tfx/orchestration/launcher/in_process_component_launcher.py in _run_executor(self, execution_id, input_dict, output_dict, exec_properties)
71 # be immutable in this context.
72 # output_dict can still be changed, specifically properties.
---> 73 executor.Do(
74 copy.deepcopy(input_dict), output_dict, copy.deepcopy(exec_properties))
/opt/conda/lib/python3.8/site-packages/tfx/components/transform/executor.py in Do(self, input_dict, output_dict, exec_properties)
581 # remove the `_pip_dependencies` attribute.
582 with udf_utils.TempPipInstallContext(self._pip_dependencies):
--> 583 TransformProcessor().Transform(label_inputs, label_outputs, status_file)
584 logging.debug('Cleaning up temp path %s on executor success', temp_path)
585 io_utils.delete_dir(temp_path)
/opt/conda/lib/python3.8/site-packages/tfx/components/transform/executor.py in Transform(***failed resolving arguments***)
1114 materialization_format = (
1115 transform_paths_file_formats[-1] if materialize_output_paths else None)
-> 1116 self._RunBeamImpl(analyze_data_list, transform_data_list, preprocessing_fn,
1117 stats_options_updater_fn, force_tf_compat_v1,
1118 input_dataset_metadata, transform_output_path,
/opt/conda/lib/python3.8/site-packages/tfx/components/transform/executor.py in _RunBeamImpl(self, analyze_data_list, transform_data_list, preprocessing_fn, stats_options_updater_fn, force_tf_compat_v1, input_dataset_metadata, transform_output_path, raw_examples_data_format, temp_path, input_cache_dir, output_cache_dir, disable_statistics, per_set_stats_output_paths, materialization_format, analyze_paths_count, stats_output_paths, make_beam_pipeline_fn)
1496 for dataset in transform_data_list:
1497 infix = 'TransformIndex{}'.format(dataset.index)
-> 1498 (dataset.transformed
1499 | 'EncodeAndSerialize[{}]'.format(infix) >> beam.ParDo(
1500 self._RecordBatchToExamplesFn(transformed_schema_proto))
/opt/conda/lib/python3.8/site-packages/apache_beam/pipeline.py in __exit__(self, exc_type, exc_val, exc_tb)
594 try:
595 if not exc_type:
--> 596 self.result = self.run()
597 self.result.wait_until_finish()
598 finally:
/opt/conda/lib/python3.8/site-packages/apache_beam/pipeline.py in run(self, test_runner_api)
571 finally:
572 shutil.rmtree(tmpdir)
--> 573 return self.runner.run_pipeline(self, self._options)
574 finally:
575 if not is_in_ipython():
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/direct/direct_runner.py in run_pipeline(self, pipeline, options)
129 runner = BundleBasedDirectRunner()
130
--> 131 return runner.run_pipeline(pipeline, options)
132
133
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in run_pipeline(self, pipeline, options)
197 options.view_as(pipeline_options.ProfilingOptions))
198
--> 199 self._latest_run_result = self.run_via_runner_api(
200 pipeline.to_runner_api(default_environment=self._default_environment))
201 return self._latest_run_result
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in run_via_runner_api(self, pipeline_proto)
208 # TODO(pabloem, BEAM-7514): Create a watermark manager (that has access to
209 # the teststream (if any), and all the stages).
--> 210 return self.run_stages(stage_context, stages)
211
212 @contextlib.contextmanager
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in run_stages(self, stage_context, stages)
393 )
394
--> 395 stage_results = self._run_stage(
396 runner_execution_context, bundle_context_manager)
397
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in _run_stage(self, runner_execution_context, bundle_context_manager)
658 while True:
659 last_result, deferred_inputs, fired_timers, watermark_updates = (
--> 660 self._run_bundle(
661 runner_execution_context,
662 bundle_context_manager,
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in _run_bundle(self, runner_execution_context, bundle_context_manager, data_input, data_output, input_timers, expected_timer_output, bundle_manager)
781 expected_timer_output)
782
--> 783 result, splits = bundle_manager.process_bundle(
784 data_input, data_output, input_timers, expected_timer_output)
785 # Now we collect all the deferred inputs remaining from bundle execution.
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in process_bundle(self, inputs, expected_outputs, fired_timers, expected_output_timers, dry_run)
1092 process_bundle_descriptor.id,
1093 cache_tokens=[next(self._cache_token_generator)]))
-> 1094 result_future = self._worker_handler.control_conn.push(process_bundle_req)
1095
1096 split_results = [] # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/worker_handlers.py in push(self, request)
376 self._uid_counter += 1
377 request.instruction_id = 'control_%s' % self._uid_counter
--> 378 response = self.worker.do_instruction(request)
379 return ControlFuture(request.instruction_id, response)
380
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py in do_instruction(self, request)
578 if request_type:
579 # E.g. if register is set, this will call self.register(request.register))
--> 580 return getattr(self, request_type)(
581 getattr(request, request_type), request.instruction_id)
582 else:
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py in process_bundle(self, request, instruction_id)
616 with self.maybe_profile(instruction_id):
617 delayed_applications, requests_finalization = (
--> 618 bundle_processor.process_bundle(instruction_id))
619 monitoring_infos = bundle_processor.monitoring_infos()
620 monitoring_infos.extend(self.state_cache_metrics_fn())
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/bundle_processor.py in process_bundle(self, instruction_id)
993 element.timer_family_id, timer_data)
994 elif isinstance(element, beam_fn_api_pb2.Elements.Data):
--> 995 input_op_by_transform_id[element.transform_id].process_encoded(
996 element.data)
997
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/bundle_processor.py in process_encoded(self, encoded_windowed_values)
219 decoded_value = self.windowed_coder_impl.decode_from_stream(
220 input_stream, True)
--> 221 self.output(decoded_value)
222
223 def monitoring_infos(self, transform_id, tag_to_pcollection_id):
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.Operation.output()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.Operation.output()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.SingletonConsumerSet.receive()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.DoOperation.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.DoOperation.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner._reraise_augmented()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.SimpleInvoker.invoke_process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common._OutputProcessor.process_outputs()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.SingletonConsumerSet.receive()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.DoOperation.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.DoOperation.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner._reraise_augmented()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.SimpleInvoker.invoke_process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common._OutputProcessor.process_outputs()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.SingletonConsumerSet.receive()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.DoOperation.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.DoOperation.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner._reraise_augmented()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.DoFnRunner.process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.PerWindowInvoker.invoke_process()
/opt/conda/lib/python3.8/site-packages/apache_beam/runners/common.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window()
/opt/conda/lib/python3.8/site-packages/apache_beam/transforms/core.py in <lambda>(x, *args, **kwargs)
1635 from apache_beam.transforms.util import fn_takes_side_inputs
1636 if fn_takes_side_inputs(fn):
-> 1637 wrapper = lambda x, *args, **kwargs: [fn(x, *args, **kwargs)]
1638 else:
1639 wrapper = lambda x: [fn(x)]
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/beam/impl.py in _create_v2_saved_model(tensor_replacement_map, base_temp_dir, preprocessing_fn, input_signature, baseline_analyzers_fingerprint, output_keys_to_name_map)
661 """
662 saved_model_dir = beam_common.get_unique_temp_path(base_temp_dir)
--> 663 impl_helper.trace_and_write_v2_saved_model(saved_model_dir, preprocessing_fn,
664 input_signature, base_temp_dir,
665 baseline_analyzers_fingerprint,
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/impl_helper.py in trace_and_write_v2_saved_model(saved_model_dir, preprocessing_fn, input_signature, base_temp_dir, baseline_analyzers_fingerprint, tensor_replacement_map, output_keys_to_name_map)
892 if not concrete_transform_fn.graph.get_collection(
893 analyzer_nodes.TENSOR_REPLACEMENTS):
--> 894 metadata = _trace_and_get_metadata(concrete_transform_fn, structured_inputs,
895 preprocessing_fn, base_temp_dir,
896 tensor_replacement_map)
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/impl_helper.py in _trace_and_get_metadata(concrete_transform_fn, structured_inputs, preprocessing_fn, base_temp_dir, tensor_replacement_map)
804 evaluate_schema_overrides=True)
805 return dataset_metadata.DatasetMetadata(
--> 806 schema=schema_inference.infer_feature_schema_v2(
807 concrete_transform_fn.structured_outputs,
808 concrete_metadata_fn,
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/schema_inference.py in infer_feature_schema_v2(features, concrete_metadata_fn, evaluate_schema_overrides)
254 tensor_annotations, global_annotations = _get_schema_annotations_v2(
255 metadata)
--> 256 return _infer_feature_schema_common(
257 features,
258 tensor_ranges,
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/schema_inference.py in _infer_feature_schema_common(features, tensor_ranges, feature_annotations, global_annotations, is_evaluation_complete)
299 domains[name] = schema_pb2.IntDomain(
300 min=min_value, max=max_value, is_categorical=True)
--> 301 feature_spec = _feature_spec_from_batched_tensors(features,
302 is_evaluation_complete)
303
/opt/conda/lib/python3.8/site-packages/tensorflow_transform/schema_inference.py in _feature_spec_from_batched_tensors(tensors, is_evaluation_complete)
127 if is_evaluation_complete and any(
128 dim is None for dim in shape.as_list()[1:]):
--> 129 raise ValueError(
130 'Feature {} ({}) had invalid shape {} for FixedLenFeature: apart '
131 'from the batch dimension, all dimensions must have known size'
ValueError: Feature raw_image (Tensor("Identity_1:0", shape=(None, 1, None), dtype=int64)) had invalid shape (None, 1, None) for FixedLenFeature: apart from the batch dimension, all dimensions must have known size [while running 'Analyze/CreateSavedModel[tf_v2_only]/CreateSavedModel']
I know the label feature is working, as I can call the code below and get the printout shown afterwards.
transform = tfx.components.Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath(_mnist_transform_module))
context.run(transform, enable_cache=False)
# Get the URI of the output artifact representing the transformed examples
train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'Split-train')
# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]
# Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")
# Decode the first record and print output
for tfrecord in dataset.take(1):
    serialized_example = tfrecord.numpy()
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    print(example)
If I remove the lines:
img = tf.io.decode_raw(raw_image_dataset, tf.int64)
outputs[_IMAGE_KEY] = img
I get this printed:
features {
  feature {
    key: "label"
    value {
      int64_list {
        value: 5
      }
    }
  }
}
This shows that what I am doing to the label feature is working, but I really can't figure out how to transform the image bytes. Part of the issue is that I'm not completely sure what the format is, since it's just a tensor, which is pretty opaque. Given the label operation, it seems I'm effectively operating on a column of data, but again, I can't figure out the correct operation or syntax.
For any future viewers, this works:
raw_image_dataset = tf.map_fn(
    fn=lambda x: tf.io.parse_tensor(x[0], tf.uint8, name=None),
    elems=raw_image_dataset,
    fn_output_signature=tf.TensorSpec((28, 28), dtype=tf.uint8, name=None),
    infer_shape=True)
raw_image_dataset = tf.cast(raw_image_dataset, tf.int64)
outputs[_IMAGE_KEY] = raw_image_dataset
So I think I solved this using:
raw_image_dataset = inputs[_IMAGE_KEY]
raw_image_dataset = tf.map_fn(
    fn=lambda x: tf.io.decode_image(x[0]),
    elems=raw_image_dataset,
    dtype=tf.uint8)
There's something about the data going in as a batch, so it needs to be mapped, and you also have to use the right component of the resulting tensor ("x[0]"). I'm still not 100% sure why this is the case, but it seems to run.
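Putting the working pieces together, the full preprocessing_fn would look roughly like this (a sketch based on the parse_tensor version above; _IMAGE_KEY and _LABEL_KEY are assumed to be defined in the module file):
def preprocessing_fn(inputs):
    outputs = {}
    raw_image_dataset = inputs[_IMAGE_KEY]
    # Each element arrives as a serialized tensor inside a length-1 inner
    # dimension, so parse per element and declare the fixed (28, 28) shape.
    images = tf.map_fn(
        fn=lambda x: tf.io.parse_tensor(x[0], tf.uint8, name=None),
        elems=raw_image_dataset,
        fn_output_signature=tf.TensorSpec((28, 28), dtype=tf.uint8, name=None))
    outputs[_IMAGE_KEY] = tf.cast(images, tf.int64)
    outputs[_LABEL_KEY] = tf.cast(inputs[_LABEL_KEY], tf.int64)
    return outputs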
Now I'm struggling with TFX, as it won't let me output features that are different from what went in...
I will preface this by saying I'm a very amateur user, and though I've researched my problem extensively, I have not found a solution. I assume the solution is simple, but we will see.
Simplified, I have a dataframe with column names A, B, C, D, etc., and I want to change those names to a, b, c, d, etc. The list of column names is long, so to achieve this I've imported a dataframe from an Excel file with 2 columns (I used Excel here because I want to create an easily reproducible method for the entire program I'm creating). The first column has A, B, C, D... and the second column has a, b, c, d.
I then took this dataframe, set the index to column 0, and transposed it. I then used .to_dict('list'), and the resulting dictionary looks almost correct, except that the values are in lists: {'A': ['a'], 'B': ['b'], ...}. So when I try to execute df.rename(columns=dictionary), I get the unhashable type: 'list' error.
I know this is because my values are stored as lists, if the dictionary looked like {'A':'a', 'B':'b'...} I'm betting it would work fine.
So basically, how do I turn my dataframe into a dictionary formatted like that, without the lists? Or is this not possible, and should I approach this a different way?
Thanks!
Here is my actual code:
INPUT
df_plate = pd.read_excel('plate.xlsx',index_col='sample')
df_plate_t = df_plate.T
dict_plate = df_plate_t.to_dict('list')
df_sorted2 = df_sorted.rename(columns=dict_plate)
df_sorted2
OUTPUT
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\lib\pretty.py in pretty(self, obj)
400 if cls is not object \
401 and callable(cls.__dict__.get('__repr__')):
--> 402 return _repr_pprint(obj, self, cycle)
403
404 return _default_pprint(obj, self, cycle)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\lib\pretty.py in _repr_pprint(obj, p, cycle)
695 """A pprint that just redirects to the normal repr function."""
696 # Find newlines and replace them with p.break_()
--> 697 output = repr(obj)
698 for idx,output_line in enumerate(output.splitlines()):
699 if idx:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\base.py in __repr__(self)
76 Yields Bytestring in Py2, Unicode String in py3.
77 """
---> 78 return str(self)
79
80
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\base.py in __str__(self)
55
56 if compat.PY3:
---> 57 return self.__unicode__()
58 return self.__bytes__()
59
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __unicode__(self)
632 width = None
633 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 634 line_width=width, show_dimensions=show_dimensions)
635
636 return buf.getvalue()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, line_width)
719 decimal=decimal,
720 line_width=line_width)
--> 721 formatter.to_string()
722
723 if buf is None:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\format.py in to_string(self)
596 else:
597
--> 598 strcols = self._to_str_columns()
599 if self.line_width is None: # no need to wrap around just print
600 # the whole frame
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\format.py in _to_str_columns(self)
527 str_columns = [[label] for label in self.header]
528 else:
--> 529 str_columns = self._get_formatted_column_labels(frame)
530
531 stringified = []
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\format.py in _get_formatted_column_labels(self, frame)
770 need_leadsp[x] else x]
771 for i, (col, x) in enumerate(zip(columns,
--> 772 fmt_columns))]
773
774 if self.show_row_idx_names:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\format.py in <listcomp>(.0)
769 str_columns = [[' ' + x if not self._get_formatter(i) and
770 need_leadsp[x] else x]
--> 771 for i, (col, x) in enumerate(zip(columns,
772 fmt_columns))]
773
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\format.py in _get_formatter(self, i)
363 if is_integer(i) and i not in self.columns:
364 i = self.columns[i]
--> 365 return self.formatters.get(i, None)
366
367
TypeError: unhashable type: 'list'
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _repr_html_(self)
672
673 return self.to_html(max_rows=max_rows, max_cols=max_cols,
--> 674 show_dimensions=show_dimensions, notebook=True)
675 else:
676 return None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in to_html(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, bold_rows, classes, escape, notebook, border, table_id, render_links)
2263 render_links=render_links)
2264 # TODO: a generic formatter wld b in DataFrameFormatter
-> 2265 formatter.to_html(classes=classes, notebook=notebook, border=border)
2266
2267 if buf is None:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\format.py in to_html(self, classes, notebook, border)
727 from pandas.io.formats.html import HTMLFormatter, NotebookFormatter
728 Klass = NotebookFormatter if notebook else HTMLFormatter
--> 729 html = Klass(self, classes=classes, border=border).render()
730 if hasattr(self.buf, 'write'):
731 buffer_put_lines(self.buf, html)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\html.py in render(self)
527 self.write('<div>')
528 self.write_style()
--> 529 super(NotebookFormatter, self).render()
530 self.write('</div>')
531 return self.elements
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\html.py in render(self)
144
145 def render(self):
--> 146 self._write_table()
147
148 if self.should_show_dimensions:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\html.py in _write_table(self, indent)
180 self._write_header(indent + self.indent_delta)
181
--> 182 self._write_body(indent + self.indent_delta)
183
184 self.write('</table>', indent)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\html.py in _write_body(self, indent)
323 def _write_body(self, indent):
324 self.write('<tbody>', indent)
--> 325 fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}
326
327 # write values
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\html.py in <dictcomp>(.0)
323 def _write_body(self, indent):
324 self.write('<tbody>', indent)
--> 325 fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}
326
327 # write values
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\format.py in _format_col(self, i)
702 def _format_col(self, i):
703 frame = self.tr_frame
--> 704 formatter = self._get_formatter(i)
705 values_to_format = frame.iloc[:, i]._formatting_values()
706 return format_array(values_to_format, formatter,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\formats\format.py in _get_formatter(self, i)
363 if is_integer(i) and i not in self.columns:
364 i = self.columns[i]
--> 365 return self.formatters.get(i, None)
366
367
TypeError: unhashable type: 'list'
Yup, it was an easy solution. If you want to do this and don't know how (probably not many of you out there...), then you want to use a Series rather than a DataFrame, with the keys coming from the index and the values from a column.
dict_plate = pd.Series(df_plate['condition'].values,index=df_plate['sample']).to_dict()
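Since 'sample' is already the index after the read_excel call earlier, an equivalent shortcut (assuming the lowercase names live in a column called 'condition', as in the line above) would be:
dict_plate = df_plate['condition'].to_dict()
df_sorted2 = df_sorted.rename(columns=dict_plate)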
I am trying to run the example for MaskedAutoregressiveFlow at https://www.tensorflow.org/api_docs/python/tf/contrib/distributions/bijectors/MaskedAutoregressiveFlow. It's a plain copy from the docs, but I receive the following error. I've tried event_shape=[dims, 1], but that doesn't seem to help (different error). I'm not sure what to make of it.
Has anyone seen this as well?
import tensorflow as tf
import tensorflow.contrib.distributions as tfd
from tensorflow.contrib.distributions import bijectors as tfb
dims = 5
# A common choice for a normalizing flow is to use a Gaussian for the base
# distribution. (However, any continuous distribution would work.) E.g.,
maf = tfd.TransformedDistribution(
    distribution=tfd.Normal(loc=0., scale=1.),
    bijector=tfb.MaskedAutoregressiveFlow(
        shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
            hidden_layers=[512, 512])),
    event_shape=[dims])
x = maf.sample() # Expensive; uses `tf.while_loop`, no Bijector caching.
maf.log_prob(x) # Almost free; uses Bijector caching.
maf.log_prob(0.) # Cheap; no `tf.while_loop` despite no Bijector caching.
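One detail that may matter here (an untested guess, not a confirmed fix): the shape [5] in the error below suggests the masked dense layer is being fed a rank-1 tensor, because a scalar sample() call produces only the event shape. Passing an explicit sample shape would give it a batch dimension:
x = maf.sample([1])  # shape [1, 5] rather than [5], so the layer sees ndim=2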
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-3b2fcb2af309> in <module>()
11
12
---> 13 x = maf.sample() # Expensive; uses `tf.while_loop`, no Bijector caching.
14 maf.log_prob(x) # Almost free; uses Bijector caching.
15 maf.log_prob(0.) # Cheap; no `tf.while_loop` despite no Bijector caching.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/distributions/distribution.py in sample(self, sample_shape, seed, name)
687 samples: a `Tensor` with prepended dimensions `sample_shape`.
688 """
--> 689 return self._call_sample_n(sample_shape, seed, name)
690
691 def _log_prob(self, value):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/distributions/transformed_distribution.py in _call_sample_n(self, sample_shape, seed, name, **kwargs)
411 # work, it is imperative that this is the last modification to the
412 # returned result.
--> 413 y = self.bijector.forward(x, **kwargs)
414 y = self._set_sample_static_shape(y, sample_shape)
415
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/distributions/bijector_impl.py in forward(self, x, name)
618 NotImplementedError: if `_forward` is not implemented.
619 """
--> 620 return self._call_forward(x, name)
621
622 def _inverse(self, y):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/distributions/bijector_impl.py in _call_forward(self, x, name, **kwargs)
599 if mapping.y is not None:
600 return mapping.y
--> 601 mapping = mapping.merge(y=self._forward(x, **kwargs))
602 self._cache(mapping)
603 return mapping.y
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py in _forward(self, x)
245 y0 = array_ops.zeros_like(x, name="y0")
246 # call the template once to ensure creation
--> 247 _ = self._shift_and_log_scale_fn(y0)
248 def _loop_body(index, y0):
249 """While-loop body for autoregression calculation."""
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/template.py in __call__(self, *args, **kwargs)
358 custom_getter=self._custom_getter) as vs:
359 self._variable_scope = vs
--> 360 result = self._call_func(args, kwargs)
361 return result
362
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/template.py in _call_func(self, args, kwargs)
300 trainable_at_start = len(
301 ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
--> 302 result = self._func(*args, **kwargs)
303
304 if self._variables_created:
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py in _fn(x)
478 activation=activation,
479 *args,
--> 480 **kwargs)
481 x = masked_dense(
482 inputs=x,
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py in masked_dense(inputs, units, num_blocks, exclusive, kernel_initializer, reuse, name, *args, **kwargs)
386 *args,
387 **kwargs)
--> 388 return layer.apply(inputs)
389
390
/usr/local/lib/python3.6/dist-packages/tensorflow/python/layers/base.py in apply(self, inputs, *args, **kwargs)
807 Output tensor(s).
808 """
--> 809 return self.__call__(inputs, *args, **kwargs)
810
811 def _add_inbound_node(self,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/layers/base.py in __call__(self, inputs, *args, **kwargs)
671
672 # Check input assumptions set before layer building, e.g. input rank.
--> 673 self._assert_input_compatibility(inputs)
674 if input_list and self._dtype is None:
675 try:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/layers/base.py in _assert_input_compatibility(self, inputs)
1195 ', found ndim=' + str(ndim) +
1196 '. Full shape received: ' +
-> 1197 str(x.get_shape().as_list()))
1198 # Check dtype.
1199 if spec.dtype is not None:
ValueError: Input 0 of layer dense_1 is incompatible with the layer: : expected min_ndim=2, found ndim=1. Full shape received: [5]
originally defined at:
File "<ipython-input-2-3b2fcb2af309>", line 9, in <module>
hidden_layers=[512, 512])),
File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py", line 499, in masked_autoregressive_default_template
"masked_autoregressive_default_template", _fn)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/template.py", line 152, in make_template
**kwargs)
I have a bunch of data stored in vals. The indices are monotonic, but not continuous. I'm attempting to do some analysis on histograms of the data, so I've created the following structure:
hist = pd.DataFrame(vals)
hist['bins'] = pd.cut(vals, 100)
This is data taken from an experimental instrument, and I know that some of the bins have only 1 or 2 counts in them, which I'm trying to remove. I've tried using groupby as follows and get the following error (full traceback included at the end of the note):
hist.groupby('bins').describe()
AttributeError: 'Categorical' object has no attribute 'flags'
However, when I do the following, the error does not show up and I get the expected result:
In[]: hist.index = hist.bins
In[]: hist['bins'] = hist.index
In[]: desc = hist.groupby('bins').describe()
In[]: desc.index.names = ['bins', 'describe']
Out[]: **describe with MultiIndex for rows.**
If I don't include the second line, hist['bins'] = hist.index, I still get an AttributeError: 'Categorical' object has no attribute 'flags', and as best I can tell, the traceback is identical.
Can someone explain what the flags are and why they only seem to work when I set the index to bins and then replace the bins by the version stored in the index?
My end goal is to remove the data for bins with counts <= 6. If someone has an easier workaround than the way I'm going about it, I'd also be grateful.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-f606a051f2e4> in <module>()
----> 1 hist.groupby('bins').describe()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\displayhook.pyc in __call__(self, result)
245 self.start_displayhook()
246 self.write_output_prompt()
--> 247 format_dict, md_dict = self.compute_format_data(result)
248 self.write_format_data(format_dict, md_dict)
249 self.update_user_ns(result)
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\displayhook.pyc in compute_format_data(self, result)
155
156 """
--> 157 return self.shell.display_formatter.format(result)
158
159 def write_format_data(self, format_dict, md_dict=None):
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\formatters.pyc in format(self, obj, include, exclude)
150 md = None
151 try:
--> 152 data = formatter(obj)
153 except:
154 # FIXME: log the exception
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\formatters.pyc in __call__(self, obj)
479 type_pprinters=self.type_printers,
480 deferred_pprinters=self.deferred_printers)
--> 481 printer.pretty(obj)
482 printer.flush()
483 return stream.getvalue()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\lib\pretty.pyc in pretty(self, obj)
360 if callable(meth):
361 return meth(obj, self, cycle)
--> 362 return _default_pprint(obj, self, cycle)
363 finally:
364 self.end_group()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\lib\pretty.pyc in _default_pprint(obj, p, cycle)
480 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
481 # A user-provided repr.
--> 482 p.text(repr(obj))
483 return
484 p.begin_group(1, '<')
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __repr__(self)
62 Yields Bytestring in Py2, Unicode String in py3.
63 """
---> 64 return str(self)
65
66
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __str__(self)
42 if compat.PY3:
43 return self.__unicode__()
---> 44 return self.__bytes__()
45
46 def __bytes__(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __bytes__(self)
54
55 encoding = get_option("display.encoding")
---> 56 return self.__unicode__().encode(encoding, 'replace')
57
58 def __repr__(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\frame.pyc in __unicode__(self)
507 width = None
508 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 509 line_width=width, show_dimensions=show_dimensions)
510
511 return buf.getvalue()
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\frame.pyc in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, line_width, max_rows, max_cols, show_dimensions)
1340 max_rows=max_rows,
1341 max_cols=max_cols,
-> 1342 show_dimensions=show_dimensions)
1343 formatter.to_string()
1344
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\format.pyc in __init__(self, frame, buf, columns, col_space, header, index, na_rep, formatters, justify, float_format, sparsify, index_names, line_width, max_rows, max_cols, show_dimensions, **kwds)
345 self.columns = frame.columns
346
--> 347 self._chk_truncate()
348
349 def _chk_truncate(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\format.pyc in _chk_truncate(self)
410 else:
411 row_num = max_rows_adj // 2
--> 412 frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :]))
413 self.tr_row_num = row_num
414
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _get_new_axes(self)
957 new_axes[i] = ax
958
--> 959 new_axes[self.axis] = self._get_concat_axis()
960 return new_axes
961
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _get_concat_axis(self)
1009
1010 if self.keys is None:
-> 1011 concat_axis = _concat_indexes(indexes)
1012 else:
1013 concat_axis = _make_concat_multiindex(indexes, self.keys,
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _concat_indexes(indexes)
1027
1028 def _concat_indexes(indexes):
-> 1029 return indexes[0].append(indexes[1:])
1030
1031
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\index.pyc in append(self, other)
4603 arrays = []
4604 for i in range(self.nlevels):
-> 4605 label = self.get_level_values(i)
4606 appended = [o.get_level_values(i) for o in other]
4607 arrays.append(label.append(appended))
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\index.pyc in get_level_values(self, level)
4239 unique = self.levels[num] # .values
4240 labels = self.labels[num]
-> 4241 filled = com.take_1d(unique.values, labels, fill_value=unique._na_value)
4242 values = unique._simple_new(filled, self.names[num],
4243 freq=getattr(unique, 'freq', None),
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\common.pyc in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
829 out_shape[axis] = len(indexer)
830 out_shape = tuple(out_shape)
--> 831 if arr.flags.f_contiguous and axis == arr.ndim - 1:
832 # minor tweak that can make an order-of-magnitude difference
833 # for dataframes initialized directly from 2-d ndarrays
AttributeError: 'Categorical' object has no attribute 'flags'
This looks to be a bug with Categorical data that will be corrected in version 0.17.0 (issue here).
In the meantime, you could just cast the category to an object dtype - this is what was happening when you assigned to the index and back.
df['bins'] = df['bins'].astype(str)
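For the stated end goal of dropping bins with counts <= 6, a filter on the bin counts would look something like this (a sketch using hist from the question and the string-cast 'bins' column from the line above):
hist['bins'] = hist['bins'].astype(str)
bin_counts = hist['bins'].map(hist['bins'].value_counts())
hist = hist[bin_counts > 6]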