Possible bug with `xarray.Dataset.groupby()`? - python

I'm using Xarray version 0.8.0, Python 3.5.1, on Mac OS X El Capitan 10.11.6.
The following code works as expected.
import numpy
import xarray

id_data_array = xarray.DataArray([280, 306, 280], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 280
score (index) float64 0.8358 0.7536 0.9495
======
<xarray.Dataset>
Dimensions: (id: 2)
Coordinates:
* id (id) int64 280 306
Data variables:
score (id) int64 2 1
However, if I change just one little thing, so that the elements of id_data_array are all distinct, I get an error.
Code:
id_data_array = xarray.DataArray([280, 306, 120], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 120
score (index) float64 0.1353 0.0437 0.1687
======
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-92-cc412270ba2e> in <module>()
5 print(score_dataset)
6 print("======")
----> 7 print(score_dataset.groupby("id").count())
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/common.py in wrapped_func(self, dim, keep_attrs, **kwargs)
44 return self.reduce(func, dim, keep_attrs,
45 numeric_only=numeric_only, allow_lazy=True,
---> 46 **kwargs)
47 return wrapped_func
48
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
605 def reduce_dataset(ds):
606 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 607 return self.apply(reduce_dataset)
608
609 def assign(self, **kwargs):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in apply(self, func, **kwargs)
562 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
563 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 564 combined = self._concat(applied)
565 result = self._maybe_restore_empty_groups(combined)
566 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in _concat(self, applied)
570 concat_dim, positions = self._infer_concat_args(applied_example)
571
--> 572 combined = concat(applied, concat_dim)
573 reordered = _maybe_reorder(combined, concat_dim, positions)
574 return reordered
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
114 raise TypeError('can only concatenate xarray Dataset and DataArray '
115 'objects, got %s' % type(first_obj))
--> 116 return f(objs, dim, data_vars, coords, compat, positions)
117
118
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
276 if coord is not None:
277 # add concat dimension last to ensure that its in the final Dataset
--> 278 result[coord.name] = coord
279
280 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in __setitem__(self, key, value)
536 raise NotImplementedError('cannot yet use a dictionary as a key '
537 'to set Dataset values')
--> 538 self.update({key: value})
539
540 def __delitem__(self, key):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in update(self, other, inplace)
1434 dataset.
1435 """
-> 1436 variables, coord_names, dims = dataset_update_method(self, other)
1437
1438 return self._replace_vars_and_dims(variables, coord_names, dims,
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in dataset_update_method(dataset, other)
490 priority_arg = 1
491 indexes = dataset.indexes
--> 492 return merge_core(objs, priority_arg=priority_arg, indexes=indexes)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
371
372 coerced = coerce_pandas_values(objs)
--> 373 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
374 expanded = expand_variable_dicts(aligned)
375
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in deep_align(list_of_variable_maps, join, copy, indexes)
146 out.append(variables)
147
--> 148 aligned = partial_align(*targets, join=join, copy=copy, indexes=indexes)
149
150 for key, aligned_obj in zip(keys, aligned):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in partial_align(*objects, **kwargs)
109 valid_indexers = dict((k, v) for k, v in joined_indexes.items()
110 if k in obj.dims)
--> 111 result.append(obj.reindex(copy=copy, **valid_indexers))
112 return tuple(result)
113
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in reindex(self, indexers, method, tolerance, copy, **kw_indexers)
1216
1217 variables = alignment.reindex_variables(
-> 1218 self.variables, self.indexes, indexers, method, tolerance, copy=copy)
1219 return self._replace_vars_and_dims(variables)
1220
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in reindex_variables(variables, indexes, indexers, method, tolerance, copy)
218 target = utils.safe_cast_to_index(indexers[name])
219 indexer = index.get_indexer(target, method=method,
--> 220 **get_indexer_kwargs)
221
222 to_shape[name] = len(target)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
2080
2081 if not self.is_unique:
-> 2082 raise InvalidIndexError('Reindexing only valid with uniquely'
2083 ' valued Index objects')
2084
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
To me this seems buggy, because if this is the desired behaviour it would be very strange. Surely grouping should also work when all the elements of the DataArray we're grouping by are distinct?
Update
I've now uninstalled and reinstalled Xarray. The new Xarray is version 0.8.1, and it seems to work fine. So it may indeed be a bug in Xarray 0.8.0.
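For anyone stuck on 0.8.0, a possible workaround (a sketch, not from the original post, and untested against that exact release) is to do the grouped count in pandas via Dataset.to_dataframe():

counts = score_dataset.to_dataframe().groupby("id").count()
print(counts)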

Related

MemoryError when using pandas_profiling profile_report

I'm trying to profile an Excel file. It is a very small data set, only 30 columns and 535 rows, but when I run the profile_report function it stops each time at a different percentage, always with the same message:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-41-283dd2cb2000> in <module>
1 df=pd.read_excel(path_working+'Documents/Information/'+'sample.xlsx')
2 profile = df.profile_report(title='Sample Exploratory')
----> 3 profile.to_file(path_working+'sample.html')
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_file(self, output_file, silent)
276 create_html_assets(output_file)
277
--> 278 data = self.to_html()
279
280 if output_file.suffix != ".html":
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
384
385 """
--> 386 return self.html
387
388 def to_json(self) -> str:
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in html(self)
199 def html(self):
200 if self._html is None:
--> 201 self._html = self._render_html()
202 return self._html
203
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
306 from pandas_profiling.report.presentation.flavours import HTMLReport
307
--> 308 report = self.report
309
310 disable_progress_bar = not config["progress_bar"].get(bool)
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in report(self)
193 def report(self):
194 if self._report is None:
--> 195 self._report = get_report_structure(self.description_set)
196 return self._report
197
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
172 def description_set(self):
173 if self._description_set is None:
--> 174 self._description_set = describe_df(
175 self.title, self.df, self.summarizer, self.typeset, self._sample
176 )
~\anaconda3\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df, summarizer, typeset, sample)
72 total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
73 ) as pbar:
---> 74 series_description = get_series_descriptions(df, summarizer, typeset, pbar)
75
76 pbar.set_postfix_str("Get variable types")
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, summarizer, typeset, pbar)
97 # TODO: use `Pool` for Linux-based systems
98 with multiprocessing.pool.ThreadPool(pool_size) as executor:
---> 99 for i, (column, description) in enumerate(
100 executor.imap_unordered(multiprocess_1d, args)
101 ):
~\anaconda3\lib\multiprocessing\pool.py in next(self, timeout)
866 if success:
867 return value
--> 868 raise value
869
870 __next__ = next # XXX
~\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
76 """
77 column, series = args
---> 78 return column, describe_1d(series, summarizer, typeset)
79
80 pool_size = config["pool_size"].get(int)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series, summarizer, typeset)
50 vtype = typeset.detect_type(series)
51
---> 52 return summarizer.summarize(series, dtype=vtype)
53
54
~\anaconda3\lib\site-packages\pandas_profiling\model\summarizer.py in summarize(self, series, dtype)
54 """
55 summarizer_func = compose(self.summary_map.get(dtype, []))
---> 56 _, summary = summarizer_func(series, {"type": dtype})
57 return summary
58
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
17 def func(f, g):
18 def func2(*x):
---> 19 res = g(*x)
20 if type(res) == bool:
21 return f(*x)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in inner(series, summary)
70 if not summary["hashable"]:
71 return series, summary
---> 72 return fn(series, summary)
73
74 return inner
~\anaconda3\lib\site-packages\visions\utils\series_utils.py in inner(series, state, *args, **kwargs)
40 return False
41
---> 42 return fn(series, state, *args, **kwargs)
43
44 return inner
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in describe_numeric_1d(series, summary)
208
209 if chi_squared_threshold > 0.0:
--> 210 stats["chi_squared"] = chi_square(finite_values)
211
212 stats["range"] = stats["max"] - stats["min"]
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_helpers.py in chi_square(values, histogram)
352 def chi_square(values=None, histogram=None):
353 if histogram is None:
--> 354 histogram, _ = np.histogram(values, bins="auto")
355 return dict(chisquare(histogram)._asdict())
356
<__array_function__ internals> in histogram(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
790 a, weights = _ravel_and_check_weights(a, weights)
791
--> 792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
793
794 # Histogram is an integer or a float array depending on the weights.
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)
444
445 # bin edges must be computed
--> 446 bin_edges = np.linspace(
447 first_edge, last_edge, n_equal_bins + 1,
448 endpoint=True, dtype=bin_type)
<__array_function__ internals> in linspace(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
126
127 delta = stop - start
--> 128 y = _nx.arange(0, num, dtype=dt).reshape((-1,) + (1,) * ndim(delta))
129 # In-place multiplication y *= delta/div is faster, but prevents the multiplicant
130 # from overriding what class is produced, and thus prevents, e.g. use of Quantities,
MemoryError: Unable to allocate 1.75 EiB for an array with shape (251938683619878560,) and data type float64
I ran the same code in a different python installation and it ran fine.
Thank you all in advance and let me know if you need more information.
This is a bug in numpy.histogram (https://github.com/numpy/numpy/issues/10297), also reported on SO (Numpy histogram extremely slow on small data set).
The error is caused by the call to np.histogram(x, bins='auto'). When the input contains very large values, the "auto" method can fail by trying to generate an enormous number of bins that cannot fit in RAM.
As a workaround, you can remove the large values manually before generating the report.
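A minimal sketch of that workaround (the file path, column name, and clipping threshold here are hypothetical):

import pandas as pd
from pandas_profiling import ProfileReport

df = pd.read_excel('sample.xlsx')
col = 'amount'                          # hypothetical column with huge values
cap = df[col].quantile(0.99)            # drop the extreme tail before profiling
profile = ProfileReport(df[df[col] <= cap], title='Sample Exploratory')
profile.to_file('sample.html')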

Python Pandas Style to every nth row

I'm working on a Python project with Pandas and am looking to apply a style to every Nth row. I've been able to select every Nth row using iloc but cannot get the style to work with a basic function. Here's my example in context:
data = [[1,2,3],[2,3,4],[3,4,5],[4,5,6]]
df = pd.DataFrame(data)
df
0 1 2
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
df.iloc[1::2, :]
0 1 2
1 2 3 4
3 4 5 6
At this point everything returns as normal, but when applying the function below, I receive a "Too many indexers" error which I can't seem to resolve.
def highlight_everyother(s):
    if s.iloc[1::2, :]:
        return ['background-color: yellow']*3

df.style.apply(highlight_everyother, axis=1)
ERROR:
IndexingError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _repr_html_(self)
180 Hooks into Jupyter notebook rich display system.
181 """
--> 182 return self.render()
183
184 #Appender(
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in render(self, **kwargs)
535 * table_attributes
536 """
--> 537 self._compute()
538 # TODO: namespace all the pandas keys
539 d = self._translate()
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _compute(self)
610 r = self
611 for func, args, kwargs in self._todo:
--> 612 r = func(self)(*args, **kwargs)
613 return r
614
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _apply(self, func, axis, subset, **kwargs)
618 data = self.data.loc[subset]
619 if axis is not None:
--> 620 result = data.apply(func, axis=axis, result_type="expand", **kwargs)
621 result.columns = data.columns
622 else:
~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
6876 kwds=kwds,
6877 )
-> 6878 return op.get_result()
6879
6880 def applymap(self, func) -> "DataFrame":
~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
184 return self.apply_raw()
185
--> 186 return self.apply_standard()
187
188 def apply_empty_result(self):
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
311
312 # compute the result using the series generator
--> 313 results, res_index = self.apply_series_generator()
314
315 # wrap results
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
339 else:
340 for i, v in enumerate(series_gen):
--> 341 results[i] = self.f(v)
342 keys.append(v.name)
343
<ipython-input-49-a5b996f8d6c8> in highlight_everyother(s)
11
12 def highlight_everyother(s):
---> 13 if s.iloc[1::2, :]:
14 return ['background-color: yellow']*3
15
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
2065 def _getitem_tuple(self, tup: Tuple):
2066
-> 2067 self._has_valid_tuple(tup)
2068 try:
2069 return self._getitem_lowerdim(tup)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)
699 for i, k in enumerate(key):
700 if i >= self.ndim:
--> 701 raise IndexingError("Too many indexers")
702 try:
703 self._validate_key(k, i)
IndexingError: Too many indexers
Any help would be appreciated. Thank you.
I would apply on axis=0 in case df is not indexed by a RangeIndex:

def highlight_everyother(s):
    return ['background-color: yellow; color:blue' if x % 2 == 1 else ''
            for x in range(len(s))]

df.style.apply(highlight_everyother)
Output: every other row is rendered with a yellow background and blue text.
You are passing one row at a time to highlight_everyother; that's why you were getting the error. With axis=1 each call receives a row as a Series, so 2-D indexing like s.iloc[1::2, :] raises "Too many indexers". The function below keys off the row name instead:
def highlight_everyother(s):
    if s.name % 2 == 1:
        return ['background-color: yellow']*3
    else:
        return ['background-color: white']*3

df.style.apply(highlight_everyother, axis=1)
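As a side note, a subset-based variant (a sketch, assuming the default RangeIndex) avoids the custom row logic entirely by styling only the target rows:

import pandas as pd

df.style.apply(lambda s: ['background-color: yellow'] * len(s),
               axis=1, subset=pd.IndexSlice[1::2, :])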

scipy sparse matrix negative column index found

I am trying to run PolynomialFeatures from sklearn with a large sparse matrix as input:

x
>>> <11967295x120006 sparse matrix of type '<class 'numpy.int64'>'
        with 55375058 stored elements in Compressed Sparse Row format>

from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
xinter = pf.fit_transform(x)

I got the error ValueError: negative column index found:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-78-dc5dc18d59d2> in <module>
1 start = time.time()
----> 2 xinter = pf.fit_transform(x)
3 end = time.time()
/venv/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
/venv/lib/python3.6/site-packages/sklearn/preprocessing/data.py in transform(self, X)
1523 break
1524 to_stack.append(Xp_next)
-> 1525 XP = sparse.hstack(to_stack, format='csr')
1526 elif sparse.isspmatrix_csc(X) and self.degree < 4:
1527 return self.transform(X.tocsr()).tocsc()
/venv/lib/python3.6/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/venv/lib/python3.6/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
572 for j in range(N):
573 if blocks[i,j] is not None:
--> 574 A = coo_matrix(blocks[i,j])
575 blocks[i,j] = A
576 block_mask[i,j] = True
/venv/lib/python3.6/site-packages/scipy/sparse/coo.py in __init__(self, arg1, shape, dtype, copy)
170 self._shape = check_shape(arg1.shape)
171 else:
--> 172 coo = arg1.tocoo()
173 self.row = coo.row
174 self.col = coo.col
/venv/lib/python3.6/site-packages/scipy/sparse/compressed.py in tocoo(self, copy)
1015 from .coo import coo_matrix
1016 return coo_matrix((self.data, (row, col)), self.shape, copy=copy,
-> 1017 dtype=self.dtype)
1018
1019 tocoo.__doc__ = spmatrix.tocoo.__doc__
/venv/lib/python3.6/site-packages/scipy/sparse/coo.py in __init__(self, arg1, shape, dtype, copy)
196 self.data = self.data.astype(dtype, copy=False)
197
--> 198 self._check()
199
200 def reshape(self, *args, **kwargs):
/venv/lib/python3.6/site-packages/scipy/sparse/coo.py in _check(self)
289 raise ValueError('negative row index found')
290 if self.col.min() < 0:
--> 291 raise ValueError('negative column index found')
292
293 def transpose(self, axes=None, copy=False):
ValueError: negative column index found
This does not look right because the col index type is int64:
> /venv/lib/python3.6/site-packages/scipy/sparse/coo.py(291)_check()
289 raise ValueError('negative row index found')
290 if self.col.min() < 0:
--> 291 raise ValueError('negative column index found')
292
293 def transpose(self, axes=None, copy=False):
ipdb> self.col.max()
2147482788
ipdb> self.col.dtype
dtype('int64')
ipdb> self.col.min()
-2147480639
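For reference, both values sit right at the signed 32-bit integer boundary, which hints that the indices were computed in int32 arithmetic and overflowed before landing in the int64 array (a sketch illustrating the boundary, not part of the original session):

import numpy as np

print(np.iinfo(np.int32).max)  # 2147483647
# 2147482788 is just below this limit; -2147480639 is what a slightly
# larger intermediate value wraps around to in 32-bit arithmetic.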
I am using the following versions:
scipy.__version__
'1.3.1'
sklearn.__version__
'0.21.2'
I'd appreciate any help figuring out this issue!

Value Error: Couldn't Convert String to Float

I have two input spreadsheets.
Sheet 1 has 7 columns and 3 rows:

    FID  Total        A1           B1   A2           B2
1   1    0.720168405  0.635589112  XXX  0.031112358  YYY
1   2    0.760438562  0.328168557  YYY  0.311172576  ZZZ

Sheet 2 has 2 columns and 4 rows:

     0
XXX  0.55
YYY  0.52
ZZZ  0.35
This is the code:
import pandas as pd
df = pd.read_excel("C:/Users/Sheet1.xls")
df2 = pd.read_excel("C:/Users/Sheet2.xlsx")
dictionary = df2.to_dict(orient='dict')
b = df.filter(like ='A').values
c = df.filter(like ='B').replace(dictionary[0]).astype(float).values
df['AA'] = ((c * b).sum(axis =1))
df['BB'] = df.AA / df.Total
def custom_round(x, base=5):
    return base * round(float(x)/base)
df['C'] = df['BB'].apply(lambda x: custom_round(x, base=.05))
df['C'] = "X = " + df['C'].apply(lambda s: '{:,.2f}'.format(s))
df.to_excel("C:/Users/Results.xlsx")
print(df)
I get the error message: ValueError: could not convert string to float: XXX
ValueError Traceback (most recent call last)
<ipython-input-1-f42c7cb99da5> in <module>()
8
9 b = df.filter(like ='A').values
---> 10 c = df.filter(like ='B').replace(dictionary[0]).astype(float).values
11
12 df['AA'] = ((c * b).sum(axis =1))
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\generic.pyc in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\internals\managers.pyc in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\internals\managers.pyc in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\internals\blocks.pyc in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\internals\blocks.pyc in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\dtypes\cast.pyc in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float: XXX
I see in the 6th line of your code that you are trying to replace one set of values in the DataFrame with another (XXX, YYY, ... with 0.55, 0.52, ...). But you end up supplying a dictionary like {0: 0.55, 1: 0.52, ...}, where the keys are row indices rather than the strings you want replaced.
I have changed your Sheet 2 header for easier indexing:

0    1
XXX  0.55
YYY  0.52
ZZZ  0.35

and set the index using the existing column 0 by replacing your line 4 with:

dictionary = df2.set_index(0)[1].to_dict()

and your line 6 with:

c = df.filter(like ='B').replace(dictionary).astype(float).values

This supplies a proper dictionary to replace().
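To see the difference, this is roughly what the two dictionaries look like with the renamed header (a sketch):

df2.to_dict(orient='dict')
# {0: {0: 'XXX', 1: 'YYY', 2: 'ZZZ'}, 1: {0: 0.55, 1: 0.52, 2: 0.35}}
df2.set_index(0)[1].to_dict()
# {'XXX': 0.55, 'YYY': 0.52, 'ZZZ': 0.35}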

Index must be called with a collection of some kind: assign column name to dataframe

I have reweightTarget as follows and I want to convert it to a pandas DataFrame. However, I get the following error:
TypeError: Index(...) must be called with a collection of some kind,
't' was passed
If I remove columns='t', it works fine. Can anyone please explain what's going on?
reweightTarget
Trading dates
2004-01-31 4.35
2004-02-29 4.46
2004-03-31 4.44
2004-04-30 4.39
2004-05-31 4.50
2004-06-30 4.53
2004-07-31 4.63
2004-08-31 4.58
dtype: float64
pd.DataFrame(reweightTarget, columns='t')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-334-bf438351aaf2> in <module>()
----> 1 pd.DataFrame(reweightTarget, columns='t')
C:\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
253 else:
254 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 255 copy=copy)
256 elif isinstance(data, (list, types.GeneratorType)):
257 if isinstance(data, types.GeneratorType):
C:\Anaconda3\lib\site-packages\pandas\core\frame.py in _init_ndarray(self, values, index, columns, dtype, copy)
421 raise_with_traceback(e)
422
--> 423 index, columns = _get_axes(*values.shape)
424 values = values.T
425
C:\Anaconda3\lib\site-packages\pandas\core\frame.py in _get_axes(N, K, index, columns)
388 columns = _default_index(K)
389 else:
--> 390 columns = _ensure_index(columns)
391 return index, columns
392
C:\Anaconda3\lib\site-packages\pandas\indexes\base.py in _ensure_index(index_like, copy)
3407 index_like = copy(index_like)
3408
-> 3409 return Index(index_like)
3410
3411
C:\Anaconda3\lib\site-packages\pandas\indexes\base.py in __new__(cls, data, dtype, copy, name, fastpath, tupleize_cols, **kwargs)
266 **kwargs)
267 elif data is None or lib.isscalar(data):
--> 268 cls._scalar_data_error(data)
269 else:
270 if (tupleize_cols and isinstance(data, list) and data and
C:\Anaconda3\lib\site-packages\pandas\indexes\base.py in _scalar_data_error(cls, data)
481 raise TypeError('{0}(...) must be called with a collection of some '
482 'kind, {1} was passed'.format(cls.__name__,
--> 483 repr(data)))
484
485 #classmethod
TypeError: Index(...) must be called with a collection of some kind, 't' was passed
Documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
columns : Index or array-like
Column labels to use for resulting frame. Will default to np.arange(n) if no column labels are provided
Example:
df3 = DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
Try to use:
pd.DataFrame(reweightTarget, columns=['t'])
When you want to set the index or columns of a DataFrame, you must pass a collection such as a list, so either of these works:
pd.DataFrame(reweightTarget, columns=['t'])
pd.DataFrame(reweightTarget, columns=list('t'))
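Since reweightTarget is a Series, another option is Series.to_frame, which names the single column directly:

df = reweightTarget.to_frame(name='t')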
