I have a dataset in CSV format. I am trying to scale my dataset, but I am getting an error. As I understand it, I need to convert from 3D to 2D, but I am not sure how to do that.
Example of my dataset:
63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3
57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2
53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1
57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0
56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0
56.0,1.0,3.0,130.0,256.0,1.0,2.0,142.0,1.0,0.6,2.0,1.0,6.0,2
44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0
52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0
57.0,1.0,3.0,150.0,168.0,0.0,0.0,174.0,0.0,1.6,1.0,0.0,3.0,0
48.0,1.0,2.0,110.0,229.0,0.0,0.0,168.0,0.0,1.0,3.0,0.0,7.0,1
54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0,0
My code:
import pandas as pd
from sklearn.preprocessing import StandardScaler
df = pd.read_csv('processed_cleveland_data.csv')
ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df), columns=df.columns)
Error:
ValueError                                Traceback (most recent call last)
<ipython-input-5-6db223ceefcd> in <module>
4 df = pd.read_csv('processed_cleveland_data.csv')
5 ss = StandardScaler()
----> 6 df_scaled = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
~\Miniconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
697 if y is None:
698 # fit method of arity 1 (unsupervised transformation)
--> 699 return self.fit(X, **fit_params).transform(X)
700 else:
701 # fit method of arity 2 (supervised transformation)
~\Miniconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
728 # Reset internal state before fitting
729 self._reset()
--> 730 return self.partial_fit(X, y, sample_weight)
731
732 def partial_fit(self, X, y=None, sample_weight=None):
~\Miniconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
764 """
765 first_call = not hasattr(self, "n_samples_seen_")
--> 766 X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767 estimator=self, dtype=FLOAT_DTYPES,
768 force_all_finite='allow-nan', reset=first_call)
~\Miniconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419 out = X
420 elif isinstance(y, str) and y == 'no_validation':
--> 421 X = check_array(X, **check_params)
422 out = X
423 else:
~\Miniconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Miniconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
614 array = array.astype(dtype, casting="unsafe", copy=False)
615 else:
--> 616 array = np.asarray(array, order=order, dtype=dtype)
617 except ComplexWarning as complex_warning:
618 raise ValueError("Complex data not supported\n"
~\Miniconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
~\Miniconda3\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
1897
1898 def __array__(self, dtype=None) -> np.ndarray:
-> 1899 return np.asarray(self._values, dtype=dtype)
1900
1901 def __array_wrap__(
~\Miniconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: '?'
The problem is not the array's dimensionality: the CSV contains '?' placeholders, which cannot be converted to float. Use na_values to convert ? to missing values:
df = pd.read_csv('processed_cleveland_data.csv', na_values='?')
# if the csv has no header:
# df = pd.read_csv('processed_cleveland_data.csv', na_values='?', header=None)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df), columns=df.columns)
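Note that the '?' cells become NaN after this. StandardScaler ignores NaNs when computing its statistics and passes them through unchanged, but most downstream estimators will not accept missing values. A minimal sketch of imputing before scaling, assuming column medians are an acceptable fill strategy (the strategy is an illustrative choice, not part of the answer above):
import pandas as pd
from sklearn.preprocessing import StandardScaler

# '?' becomes NaN on read
df = pd.read_csv('processed_cleveland_data.csv', na_values='?')

# fill each column's missing values with that column's median
df_filled = df.fillna(df.median())

ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df_filled), columns=df.columns)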
I'm working on a Python project with Pandas and looking to apply a style to every Nth row. I've been able to select every Nth row using iloc, but I cannot get the style to work with a basic function. Here's my example in context:
data = [[1,2,3],[2,3,4],[3,4,5],[4,5,6]]
df = pd.DataFrame(data)
df
   0  1  2
0  1  2  3
1  2  3  4
2  3  4  5
3  4  5  6
df.iloc[1::2, :]
   0  1  2
1  2  3  4
3  4  5  6
At this point everything returns as normal, but when I apply the function below, I receive a "Too many indexers" error which I can't seem to resolve:
def highlight_everyother(s):
    if s.iloc[1::2, :]:
        return ['background-color: yellow']*3

df.style.apply(highlight_everyother, axis=1)
ERROR:
IndexingError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _repr_html_(self)
180 Hooks into Jupyter notebook rich display system.
181 """
--> 182 return self.render()
183
184 #Appender(
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in render(self, **kwargs)
535 * table_attributes
536 """
--> 537 self._compute()
538 # TODO: namespace all the pandas keys
539 d = self._translate()
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _compute(self)
610 r = self
611 for func, args, kwargs in self._todo:
--> 612 r = func(self)(*args, **kwargs)
613 return r
614
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _apply(self, func, axis, subset, **kwargs)
618 data = self.data.loc[subset]
619 if axis is not None:
--> 620 result = data.apply(func, axis=axis, result_type="expand", **kwargs)
621 result.columns = data.columns
622 else:
~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
6876 kwds=kwds,
6877 )
-> 6878 return op.get_result()
6879
6880 def applymap(self, func) -> "DataFrame":
~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
184 return self.apply_raw()
185
--> 186 return self.apply_standard()
187
188 def apply_empty_result(self):
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
311
312 # compute the result using the series generator
--> 313 results, res_index = self.apply_series_generator()
314
315 # wrap results
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
339 else:
340 for i, v in enumerate(series_gen):
--> 341 results[i] = self.f(v)
342 keys.append(v.name)
343
<ipython-input-49-a5b996f8d6c8> in highlight_everyother(s)
11
12 def highlight_everyother(s):
---> 13 if s.iloc[1::2, :]:
14 return ['background-color: yellow']*3
15
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
2065 def _getitem_tuple(self, tup: Tuple):
2066
-> 2067 self._has_valid_tuple(tup)
2068 try:
2069 return self._getitem_lowerdim(tup)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)
699 for i, k in enumerate(key):
700 if i >= self.ndim:
--> 701 raise IndexingError("Too many indexers")
702 try:
703 self._validate_key(k, i)
IndexingError: Too many indexers
Any help would be appreciated. Thank you.
I would apply on axis=0 (the default) in case the df is not indexed by a RangeIndex:
def highlight_everyother(s):
    return ['background-color: yellow; color:blue' if x % 2 == 1 else ''
            for x in range(len(s))]

df.style.apply(highlight_everyother)
Output: every other row is highlighted yellow with blue text.
You are passing one row at a time to highlight_everyother: with axis=1, each row arrives as a Series, so two-dimensional indexing like s.iloc[1::2, :] raises "Too many indexers". The below should work.
def highlight_everyother(s):
    if s.name % 2 == 1:
        return ['background-color: yellow']*3
    else:
        return ['background-color: white']*3

df.style.apply(highlight_everyother, axis=1)
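Both answers stripe every second row. If you need the literal "every Nth row" from the question, a small generalization works; this is a sketch assuming the default RangeIndex, and n and the colour are illustrative choices:
import pandas as pd

data = [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]
df = pd.DataFrame(data)

def highlight_every_nth(s, n=2):
    # s.name is the integer row label under a RangeIndex
    style = 'background-color: yellow' if s.name % n == n - 1 else ''
    return [style] * len(s)

# extra keyword arguments to Styler.apply are forwarded to the function
df.style.apply(highlight_every_nth, axis=1, n=3)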
I'm using min_max_scaler.fit_transform() to rescale each column in the DataFrame df.
df[['A', 'B', 'C']] = min_max_scaler.fit_transform(df[['A', 'B', 'C']])
I get ValueError: setting an array element with a sequence. However, this error only occurs when I process one of my CSV files; all the others work fine. I don't know where I should start debugging. Can anyone suggest some directions to figure out the issue?
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
569 if y is None:
570 # fit method of arity 1 (unsupervised transformation)
--> 571 return self.fit(X, **fit_params).transform(X)
572 else:
573 # fit method of arity 2 (supervised transformation)
~/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_data.py in fit(self, X, y)
337 # Reset internal state before fitting
338 self._reset()
--> 339 return self.partial_fit(X, y)
340
341 def partial_fit(self, X, y=None):
~/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_data.py in partial_fit(self, X, y)
371 X = check_array(X,
372 estimator=self, dtype=FLOAT_DTYPES,
--> 373 force_all_finite="allow-nan")
374
375 data_min = np.nanmin(X, axis=0)
~/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
529 array = array.astype(dtype, casting="unsafe", copy=False)
530 else:
--> 531 array = np.asarray(array, order=order, dtype=dtype)
532 except ComplexWarning:
533 raise ValueError("Complex data not supported\n"
~/anaconda3/lib/python3.7/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: setting an array element with a sequence.
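A frequent cause of this error is a column that pandas read in as object dtype, where some cells hold non-numeric strings, so the column cannot be cast to float as a whole. A hedged debugging sketch; the filename is illustrative and the column names mirror the snippet above:
import pandas as pd

df = pd.read_csv('failing_file.csv')   # the one CSV that triggers the error
print(df[['A', 'B', 'C']].dtypes)      # an 'object' column is the usual suspect

for col in ['A', 'B', 'C']:
    coerced = pd.to_numeric(df[col], errors='coerce')
    bad = df.loc[coerced.isna() & df[col].notna(), col]
    if not bad.empty:
        print(col, 'has non-numeric cells:')
        print(bad.head())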
I'm using Xarray version 0.8.0, Python 3.5.1, on Mac OS X El Capitan 10.11.6.
The following code works as expected.
import numpy
import xarray

id_data_array = xarray.DataArray([280, 306, 280], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions:  (index: 3)
Coordinates:
  * index    (index) int64 0 1 2
Data variables:
    id       (index) int64 280 306 280
    score    (index) float64 0.8358 0.7536 0.9495
======
<xarray.Dataset>
Dimensions:  (id: 2)
Coordinates:
  * id       (id) int64 280 306
Data variables:
    score    (id) int64 2 1
However, if I change just one little thing, so that the elements of id_data_array are all distinct, there is an error.
Code:
id_data_array = xarray.DataArray([280, 306, 120], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions:  (index: 3)
Coordinates:
  * index    (index) int64 0 1 2
Data variables:
    id       (index) int64 280 306 120
    score    (index) float64 0.1353 0.0437 0.1687
======
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-92-cc412270ba2e> in <module>()
5 print(score_dataset)
6 print("======")
----> 7 print(score_dataset.groupby("id").count())
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/common.py in wrapped_func(self, dim, keep_attrs, **kwargs)
44 return self.reduce(func, dim, keep_attrs,
45 numeric_only=numeric_only, allow_lazy=True,
---> 46 **kwargs)
47 return wrapped_func
48
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
605 def reduce_dataset(ds):
606 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 607 return self.apply(reduce_dataset)
608
609 def assign(self, **kwargs):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in apply(self, func, **kwargs)
562 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
563 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 564 combined = self._concat(applied)
565 result = self._maybe_restore_empty_groups(combined)
566 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in _concat(self, applied)
570 concat_dim, positions = self._infer_concat_args(applied_example)
571
--> 572 combined = concat(applied, concat_dim)
573 reordered = _maybe_reorder(combined, concat_dim, positions)
574 return reordered
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
114 raise TypeError('can only concatenate xarray Dataset and DataArray '
115 'objects, got %s' % type(first_obj))
--> 116 return f(objs, dim, data_vars, coords, compat, positions)
117
118
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
276 if coord is not None:
277 # add concat dimension last to ensure that its in the final Dataset
--> 278 result[coord.name] = coord
279
280 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in __setitem__(self, key, value)
536 raise NotImplementedError('cannot yet use a dictionary as a key '
537 'to set Dataset values')
--> 538 self.update({key: value})
539
540 def __delitem__(self, key):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in update(self, other, inplace)
1434 dataset.
1435 """
-> 1436 variables, coord_names, dims = dataset_update_method(self, other)
1437
1438 return self._replace_vars_and_dims(variables, coord_names, dims,
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in dataset_update_method(dataset, other)
490 priority_arg = 1
491 indexes = dataset.indexes
--> 492 return merge_core(objs, priority_arg=priority_arg, indexes=indexes)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
371
372 coerced = coerce_pandas_values(objs)
--> 373 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
374 expanded = expand_variable_dicts(aligned)
375
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in deep_align(list_of_variable_maps, join, copy, indexes)
146 out.append(variables)
147
--> 148 aligned = partial_align(*targets, join=join, copy=copy, indexes=indexes)
149
150 for key, aligned_obj in zip(keys, aligned):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in partial_align(*objects, **kwargs)
109 valid_indexers = dict((k, v) for k, v in joined_indexes.items()
110 if k in obj.dims)
--> 111 result.append(obj.reindex(copy=copy, **valid_indexers))
112 return tuple(result)
113
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in reindex(self, indexers, method, tolerance, copy, **kw_indexers)
1216
1217 variables = alignment.reindex_variables(
-> 1218 self.variables, self.indexes, indexers, method, tolerance, copy=copy)
1219 return self._replace_vars_and_dims(variables)
1220
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in reindex_variables(variables, indexes, indexers, method, tolerance, copy)
218 target = utils.safe_cast_to_index(indexers[name])
219 indexer = index.get_indexer(target, method=method,
--> 220 **get_indexer_kwargs)
221
222 to_shape[name] = len(target)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
2080
2081 if not self.is_unique:
-> 2082 raise InvalidIndexError('Reindexing only valid with uniquely'
2083 ' valued Index objects')
2084
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
To me this seems buggy: if this is the desired behaviour, it would be very strange. Surely grouping should also cover the case where all the elements of the DataArray we're grouping by are distinct?
Update
I've now uninstalled and reinstalled Xarray. The new version is 0.8.1, and it seems to work fine, so this may indeed have been a bug in Xarray 0.8.0.
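For anyone pinned to 0.8.0, a possible workaround (a sketch, not an official fix) is to do the reduction in pandas, which has no trouble with all-unique group keys:
# convert to a pandas DataFrame, group by the id column, and count
counts = score_dataset.to_dataframe().groupby("id")["score"].count()
print(counts)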