Replace multiple "less than values" in different columns in pandas dataframe - python

I am working with python and pandas. I have a dataset of lab analysis results where I am dealing with multiple parameters and detection limits (dl). Many of the samples are reported as below the dl (e.g. <dl, such as <4).
For example:
import pandas as pd
df=pd.DataFrame([['<4','88.72','<0.09'],['<1','5','<0.09'],['2','17.6','<0.09']], columns=['var_1','var_2','var_3'])
df
My goal is to replace all <dl with dl/2 as a float value.
I can do this for one column pretty easily.
df['var_3'] = df.var_3.str.replace('<', '').astype(float)
df['var_3'] = df['var_3'].apply(lambda x: x/2 if x == 0.09 else x)
df
but this requires me to look up the detection limit and enter it manually.
I would like to streamline this so it applies across all variables, each of which may have one or more detection limits, since I have many variables and the detection limits will not be constant from dataframe to dataframe.
I found something similar in R but not sure how to apply it in python. Any solutions would be appreciated.
Update
So the
df=df.replace(r'<(.*)', r'\1/2', regex=True).apply(pd.eval)
approach works well on a dataframe whose columns contain only numbers; I assume that is a limitation of the eval function. I can get the code to work on smaller dataframes, but after I concatenate them it fails on the larger dataframe and I get this error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/9_/w2qcdj_x2x5852py8xl6b0sh0000gn/T/ipykernel_9403/3946462310.py in <module>
----> 1 MS=MS.replace(r'<(.*)', r'\1/2', regex=True).apply(pd.eval)
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwargs)
8738 kwargs=kwargs,
8739 )
-> 8740 return op.apply()
8741
8742 def applymap(
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/apply.py in apply(self)
686 return self.apply_raw()
687
--> 688 return self.apply_standard()
689
690 def agg(self):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/apply.py in apply_standard(self)
810
811 def apply_standard(self):
--> 812 results, res_index = self.apply_series_generator()
813
814 # wrap results
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/apply.py in apply_series_generator(self)
826 for i, v in enumerate(series_gen):
827 # ignore SettingWithCopy here in case the user mutates
--> 828 results[i] = self.f(v)
829 if isinstance(results[i], ABCSeries):
830 # If we have a view on v, we need to make a copy because
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/computation/eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
351 eng = ENGINES[engine]
352 eng_inst = eng(parsed_expr)
--> 353 ret = eng_inst.evaluate()
354
355 if parsed_expr.assigner is None:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/computation/engines.py in evaluate(self)
78
79 # make sure no names in resolvers and locals/globals clash
---> 80 res = self._evaluate()
81 return reconstruct_object(
82 self.result_type, res, self.aligned_axes, self.expr.terms.return_type
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/computation/engines.py in _evaluate(self)
119 scope = env.full_scope
120 _check_ne_builtin_clash(self.expr)
--> 121 return ne.evaluate(s, local_dict=scope)
122
123
~/opt/anaconda3/lib/python3.9/site-packages/numexpr/necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
821
822 # Create a signature
--> 823 signature = [(name, getType(arg)) for (name, arg) in
824 zip(names, arguments)]
825
~/opt/anaconda3/lib/python3.9/site-packages/numexpr/necompiler.py in <listcomp>(.0)
821
822 # Create a signature
--> 823 signature = [(name, getType(arg)) for (name, arg) in
824 zip(names, arguments)]
825
~/opt/anaconda3/lib/python3.9/site-packages/numexpr/necompiler.py in getType(a)
703 if kind == 'U':
704 raise ValueError('NumExpr 2 does not support Unicode as a dtype.')
--> 705 raise ValueError("unknown type %s" % a.dtype.name)
706
707
ValueError: unknown type object

Use replace instead of str.replace, then eval all expressions:
>>> df.replace(r'<(.*)', r'\1/2', regex=True).apply(pd.eval)
var_1 var_2 var_3
0 2.0 88.72 0.045
1 0.5 5.00 0.045
2 2.0 17.60 0.045
\1 will be replaced by the first capture group (.*).
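If pd.eval later fails with ValueError: unknown type object (as in the update above), the dataframe likely still contains non-numeric object columns. A minimal eval-free sketch, assuming every column holds strings as in the example (apply astype(str) first if not):
import pandas as pd

df = pd.DataFrame([['<4', '88.72', '<0.09'],
                   ['<1', '5', '<0.09'],
                   ['2', '17.6', '<0.09']],
                  columns=['var_1', 'var_2', 'var_3'])

def halve_below_dl(col):
    # Flag values reported below the detection limit, e.g. '<4'.
    below = col.str.startswith('<')
    # Strip the '<' and convert; anything unparseable becomes NaN.
    out = pd.to_numeric(col.str.lstrip('<'), errors='coerce')
    # Replace below-dl values with half the detection limit.
    out[below] = out[below] / 2
    return out

df_clean = df.apply(halve_below_dl)
Because nothing is eval'ed, mixed or malformed entries degrade to NaN instead of raising.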
Update
Alternative:
out = df.melt(ignore_index=False)
m = out['value'].str.startswith('<')
out.loc[m, 'value'] = out.loc[m, 'value'].str.strip('<').astype(float) / 2
out = out.reset_index().pivot(index='index', columns='variable', values='value') \
         .rename_axis(index=None, columns=None)
Output:
>>> out
var_1 var_2 var_3
0 2.0 88.72 0.045
1 0.5 5 0.045
2 2 17.6 0.045
Update
Alternative using melt to flatten your dataframe and pivot_table to reshape it back to the original layout:
df1 = df.melt(ignore_index=False)
m = df1['value'].str.startswith('<')
df1['value'] = df1['value'].mask(~m).str[1:].astype(float).div(2) \
.fillna(df1['value']).astype(float)
df1 = df1.reset_index().pivot_table('value', 'index', 'variable') \
.rename_axis(index=None, columns=None)
Output:
>>> df1
var_1 var_2 var_3
0 2.0 88.72 0.045
1 0.5 5.00 0.045
2 2.0 17.60 0.045

Related

Error with pd.IntervalIndex.from_arrays during groupby merge

I have two dataframes. They look like this:
df_a
Framecount probability
0 0.0 [0.00019486549333333332, 4.883635666666667e-06...
1 1.0 [0.00104359155, 3.9232405e-05, 0.0015722045000...
2 2.0 [0.00048501002666666667, 1.668179e-05, 0.00052...
3 3.0 [4.994969500000001e-05, 4.0931635e-07, 0.00011...
4 4.0 [0.0004808829, 5.389742e-05, 0.002522127933333...
.. ... ...
906 906.0 [1.677140566666667e-05, 1.1745095666666665e-06...
907 907.0 [1.5164155000000002e-05, 7.66629575e-07, 0.000...
908 908.0 [8.1334184e-05, 0.00012675669636333335, 0.0028...
909 909.0 [0.00014893802999999998, 1.0407592500000001e-0...
910 910.0 [4.178489e-05, 2.17477925e-06, 0.02094931, 0.0...
And:
df_b
start stop
0 12.12 12.47
1 13.44 20.82
2 20.88 29.63
3 31.61 33.33
4 33.44 42.21
.. ... ...
228 880.44 887.92
229 888.63 892.07
230 892.13 895.30
231 895.31 900.99
232 907.58 908.35
I want to merge df_a.probability onto df_b when df_a.Framecount is between df_b.start and df_b.stop. The aggregation statistic for df_a.probability should be the mean. df_a.probability holds numpy arrays.
I am trying to adapt this code from a previous answer:
df_text.idx = pd.IntervalIndex.from_arrays(df_b['start'],df_b['stop'],closed='both')
df_a['Framecount'].apply(lambda x : df_b.iloc[df_b.idx.get_loc(x)]['probability'].mean())
but it breaks with the first line with this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-200-0c75f94bf6e2> in <module>
----> 1 df_textidx = pd.IntervalIndex.from_arrays(df_text['start'],df_text['stop'],closed='both')
2 df_vid['Framecount'].apply(lambda x : df_text.iloc[df_text.idx.get_loc(x)]['probability'])
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/interval.py in from_arrays(cls, left, right, closed, name, copy, dtype)
314 with rewrite_exception("IntervalArray", cls.__name__):
315 array = IntervalArray.from_arrays(
--> 316 left, right, closed, copy=copy, dtype=dtype
317 )
318 return cls._simple_new(array, name=name)
~/anaconda3/lib/python3.7/site-packages/pandas/core/arrays/interval.py in from_arrays(cls, left, right, closed, copy, dtype)
380
381 return cls._simple_new(
--> 382 left, right, closed, copy=copy, dtype=dtype, verify_integrity=True
383 )
384
~/anaconda3/lib/python3.7/site-packages/pandas/core/arrays/interval.py in _simple_new(cls, left, right, closed, copy, dtype, verify_integrity)
239 result._closed = closed
240 if verify_integrity:
--> 241 result._validate()
242 return result
243
~/anaconda3/lib/python3.7/site-packages/pandas/core/arrays/interval.py in _validate(self)
486 if not (self.left[left_mask] <= self.right[left_mask]).all():
487 msg = "left side of interval must be <= right side"
--> 488 raise ValueError(msg)
489
490 # ---------
ValueError: left side of interval must be <= right side
Why is this happening? IIUC the left side is <= right side...
All left-side values must be less than or equal to the right-side values, which isn't an issue with the test data provided, so check your real data for rows where start > stop (or for NaNs).
For pd.IntervalIndex.from_arrays, the left and right parameters require a 1-d array.
.from_arrays(df_b['start'], df_b['stop']) passes pandas.Series objects;
df_b['start'].values will pass an np.array.
Either should work.
Also see pandas: Indexing with an IntervalIndex
import pandas as pd
# setup the test dataframe
data = {'start': [12.12, 13.44, 20.88, 31.61, 33.44, 880.44, 888.63, 892.13, 895.31, 907.58], 'stop': [12.47, 20.82, 29.63, 33.33, 42.21, 887.92, 892.07, 895.3, 900.99, 908.35]}
df = pd.DataFrame(data)
# create the interval index by passing the column as np.array
idx = pd.IntervalIndex.from_arrays(df.start.values, df.stop.values, closed='both')
# display(idx)
IntervalIndex([[12.12, 12.47], [13.44, 20.82], [20.88, 29.63], [31.61, 33.33], [33.44, 42.21], [880.44, 887.92], [888.63, 892.07], [892.13, 895.3], [895.31, 900.99], [907.58, 908.35]],
closed='both',
dtype='interval[float64]')
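To finish the actual merge, here is one possible sketch; the df_a values below are made-up stand-ins, and idx.get_indexer assumes the intervals do not overlap:
import numpy as np
import pandas as pd

# made-up stand-ins; df_a.probability holds numpy arrays as in the question
df_a = pd.DataFrame({'Framecount': [12.2, 14.0, 21.0, 30.0],
                     'probability': [np.array([0.1, 0.2]), np.array([0.3, 0.4]),
                                     np.array([0.5, 0.6]), np.array([0.7, 0.8])]})
df_b = pd.DataFrame({'start': [12.12, 13.44, 20.88], 'stop': [12.47, 20.82, 29.63]})

idx = pd.IntervalIndex.from_arrays(df_b['start'].values, df_b['stop'].values, closed='both')

# position of the interval containing each frame; -1 means no match
pos = idx.get_indexer(df_a['Framecount'])
matched = df_a[pos >= 0].assign(interval=pos[pos >= 0])

# element-wise mean of the probability arrays within each interval
means = (matched.groupby('interval')['probability']
                .apply(lambda s: np.vstack(s.tolist()).mean(axis=0)))
df_b['probability'] = means  # aligns on df_b's index; unmatched rows get NaN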

Python Pandas Style to every nth row

I'm working on a Python project with pandas and looking to apply a style to every Nth row. I've been able to select every Nth row using iloc but cannot get the style to work with a basic function. Here's my example in context:
data = [[1,2,3],[2,3,4],[3,4,5],[4,5,6]]
df = pd.DataFrame(data)
df
0 1 2
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
df.iloc[1::2, :]
0 1 2
1 2 3 4
3 4 5 6
At this point everything returns as normal, but when applying the below function, I receive a "Too many indexers" error which I can't seem to resolve:
def highlight_everyother(s):
    if s.iloc[1::2, :]:
        return ['background-color: yellow']*3

df.style.apply(highlight_everyother, axis=1)
ERROR:
IndexingError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _repr_html_(self)
180 Hooks into Jupyter notebook rich display system.
181 """
--> 182 return self.render()
183
184 #Appender(
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in render(self, **kwargs)
535 * table_attributes
536 """
--> 537 self._compute()
538 # TODO: namespace all the pandas keys
539 d = self._translate()
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _compute(self)
610 r = self
611 for func, args, kwargs in self._todo:
--> 612 r = func(self)(*args, **kwargs)
613 return r
614
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _apply(self, func, axis, subset, **kwargs)
618 data = self.data.loc[subset]
619 if axis is not None:
--> 620 result = data.apply(func, axis=axis, result_type="expand", **kwargs)
621 result.columns = data.columns
622 else:
~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
6876 kwds=kwds,
6877 )
-> 6878 return op.get_result()
6879
6880 def applymap(self, func) -> "DataFrame":
~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
184 return self.apply_raw()
185
--> 186 return self.apply_standard()
187
188 def apply_empty_result(self):
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
311
312 # compute the result using the series generator
--> 313 results, res_index = self.apply_series_generator()
314
315 # wrap results
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
339 else:
340 for i, v in enumerate(series_gen):
--> 341 results[i] = self.f(v)
342 keys.append(v.name)
343
<ipython-input-49-a5b996f8d6c8> in highlight_everyother(s)
11
12 def highlight_everyother(s):
---> 13 if s.iloc[1::2, :]:
14 return ['background-color: yellow']*3
15
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
2065 def _getitem_tuple(self, tup: Tuple):
2066
-> 2067 self._has_valid_tuple(tup)
2068 try:
2069 return self._getitem_lowerdim(tup)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)
699 for i, k in enumerate(key):
700 if i >= self.ndim:
--> 701 raise IndexingError("Too many indexers")
702 try:
703 self._validate_key(k, i)
IndexingError: Too many indexers
Any help would be appreciated. Thank you.
I would apply on axis=0 in case df is not indexed by a RangeIndex:
def highlight_everyother(s):
    return ['background-color: yellow; color:blue' if x % 2 == 1 else ''
            for x in range(len(s))]

df.style.apply(highlight_everyother)
Output: a rendered table with every other row highlighted in yellow with blue text.
You are passing one row at a time to highlight_everyother; that's why you were getting the error. The below should work.
def highlight_everyother(s):
    if s.name % 2 == 1:
        return ['background-color: yellow']*3
    else:
        return ['background-color: white']*3

df.style.apply(highlight_everyother, axis=1)
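To generalize either answer to every Nth row (the question's actual goal), one possible sketch; n=3 is arbitrary here, and Styler.apply forwards extra keyword arguments to the styling function:
def highlight_every_nth(s, n=3):
    # highlight rows whose position is a multiple of n (0-based)
    return ['background-color: yellow' if i % n == 0 else '' for i in range(len(s))]

df.style.apply(highlight_every_nth, n=3)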

Apply a function to all rows using 2 columns and creating a new column

I have a Dataframe with two columns (date as index)
date col_a col_b
1.1.2020 23 34
and I want to apply a function using the 2 columns and creating a third one (error)
date col_a col_b col_error
1.1.2020 23 34 0.4
Here is my dataframe, and below is the code with the TypeError (sorry, I can't provide the data):
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Apply a lambda function to each row by calculating the error
result['mape'] = result.apply(mean_absolute_percentage_error(test_df, pred_df))
result
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-125-681609c31734> in <module>
4
5 # Apply a lambda function to each row by calculating the error
----> 6 result['mape'] = result.apply(mean_absolute_percentage_error(test_df, pred_df))
7 result
~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6911 kwds=kwds,
6912 )
-> 6913 return op.get_result()
6914
6915 def applymap(self, func):
~/anaconda3/lib/python3.7/site-packages/pandas/core/apply.py in get_result(self)
184 return self.apply_raw()
185
--> 186 return self.apply_standard()
187
188 def apply_empty_result(self):
~/anaconda3/lib/python3.7/site-packages/pandas/core/apply.py in apply_standard(self)
290
291 # compute the result using the series generator
--> 292 self.apply_series_generator()
293
294 # wrap results
~/anaconda3/lib/python3.7/site-packages/pandas/core/apply.py in apply_series_generator(self)
319 try:
320 for i, v in enumerate(series_gen):
--> 321 results[i] = self.f(v)
322 keys.append(v.name)
323 except Exception as e:
TypeError: ("'numpy.float64' object is not callable", 'occurred at index Night_Cons(+)')
The simplest way to manipulate the columns and add a new one would be something like this:
import pandas as pd
df = pd.DataFrame({'col_a': [23], 'col_b': [34]},
                  index=[pd.to_datetime('2020-01-01')])
print(df)
col_a col_b
2020-01-01 23 34
df['col_error'] = abs((df['col_a'] - df['col_b'])/df['col_b'])
print(df)
col_a col_b col_error
2020-01-01 23 34 0.323529
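As for the TypeError itself: result.apply(mean_absolute_percentage_error(test_df, pred_df)) calls the function first and hands apply the resulting float, which is not callable. A row-wise sketch, assuming the true and predicted values live in col_a and col_b:
import numpy as np

# pass the callable itself; apply invokes it once per row with axis=1
df['mape'] = df.apply(
    lambda row: np.mean(np.abs((row['col_a'] - row['col_b']) / row['col_a'])) * 100,
    axis=1)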
To create a new column from the existing ones you can use a lambda function. For instance, if you want your new column to be the difference of the previous ones:
df['new_col'] = df.apply(lambda row: row.col_a - row.col_b, axis=1)
Hope this helps!

Aggregations over specific columns of a large dataframe, with named output

I am looking for a way to aggregate over a large dataframe, possibly using groupby. Each group would be based on either pre-specified columns or regex, and the aggregation should produce a named output.
This produces a sample dataframe:
import pandas as pd
import itertools
import numpy as np
col = "A,B,C".split(',')
col1 = "1,2,3,4,5,6,7,8,9".split(',')
col2 = "E,F,G".split(',')
all_dims = [col, col1, col2]
all_keys = ['.'.join(i) for i in itertools.product(*all_dims)]
rng = pd.date_range(end=pd.Timestamp.today().date(), periods=12, freq='M')
df = pd.DataFrame(np.random.randint(0, 1000, size=(len(rng), len(all_keys))), columns=all_keys, index=rng)
Above produces a dataframe with one year's worth of monthly data, with 36 columns with following names:
['A.1.E', 'A.1.F', 'A.1.G', 'A.2.E', 'A.2.F', 'A.2.G', 'A.3.E', 'A.3.F',
'A.3.G', 'A.4.E', 'A.4.F', 'A.4.G', 'A.5.E', 'A.5.F', 'A.5.G', 'A.6.E',
'A.6.F', 'A.6.G', 'A.7.E', 'A.7.F', 'A.7.G', 'A.8.E', 'A.8.F', 'A.8.G',
'A.9.E', 'A.9.F', 'A.9.G', 'B.1.E', 'B.1.F', 'B.1.G', 'B.2.E', 'B.2.F',
'B.2.G', 'B.3.E', 'B.3.F', 'B.3.G', 'B.4.E', 'B.4.F', 'B.4.G', 'B.5.E',
'B.5.F', 'B.5.G', 'B.6.E', 'B.6.F', 'B.6.G', 'B.7.E', 'B.7.F', 'B.7.G',
'B.8.E', 'B.8.F', 'B.8.G', 'B.9.E', 'B.9.F', 'B.9.G', 'C.1.E', 'C.1.F',
'C.1.G', 'C.2.E', 'C.2.F', 'C.2.G', 'C.3.E', 'C.3.F', 'C.3.G', 'C.4.E',
'C.4.F', 'C.4.G', 'C.5.E', 'C.5.F', 'C.5.G', 'C.6.E', 'C.6.F', 'C.6.G',
'C.7.E', 'C.7.F', 'C.7.G', 'C.8.E', 'C.8.F', 'C.8.G', 'C.9.E', 'C.9.F',
'C.9.G']
What I would like now is to be able to aggregate over the dataframe, taking certain column combinations and producing named outputs. For example, one rule might be: take all 'A.*.E' columns (any number in the middle), sum them, and produce a named output column called 'A.SUM.E'. And then do the same for 'A.*.F', 'A.*.G' and so on.
I have looked into pandas 0.25 named aggregation, which allows me to name my outputs, but I couldn't see how to simultaneously capture the right column combinations and produce the right output names.
If you need to reshape the dataframe to make a workable solution, that is fine as well.
Note, I am aware I could do something like this in a Python loop but I am looking for a pandas way to do it.
Not a groupby solution, and it uses a loop, but I think it's nonetheless rather elegant: first get a list of unique from-to column combinations using a set, then do the sums using filter:
cols = sorted(set((x.split('.')[0], x.split('.')[-1]) for x in df.columns))
for c0, c1 in cols:
    df[f'{c0}.SUM.{c1}'] = df.filter(regex=rf'{c0}\.\d+\.{c1}').sum(axis=1)
Result:
A.1.E A.1.F A.1.G A.2.E ... B.SUM.G C.SUM.E C.SUM.F C.SUM.G
2018-08-31 978 746 408 109 ... 4061 5413 4102 4908
2018-09-30 923 649 488 447 ... 5585 3634 3857 4228
2018-10-31 911 359 897 425 ... 5039 2961 5246 4126
2018-11-30 77 479 536 509 ... 4634 4325 2975 4249
2018-12-31 608 995 114 603 ... 5377 5277 4509 3499
2019-01-31 138 612 363 218 ... 4514 5088 4599 4835
2019-02-28 994 148 933 990 ... 3907 4310 3906 3552
2019-03-31 950 931 209 915 ... 4354 5877 4677 5557
2019-04-30 255 168 357 800 ... 5267 5200 3689 5001
2019-05-31 593 594 824 986 ... 4221 2108 4636 3606
2019-06-30 975 396 919 242 ... 3841 4787 4556 3141
2019-07-31 350 312 104 113 ... 4071 5073 4829 3717
If you want to have the result in a new DataFrame, just create an empty one and add the columns to it:
result = pd.DataFrame()
for c0, c1 in cols:
    result[f'{c0}.SUM.{c1}'] = df.filter(regex=rf'{c0}\.\d+\.{c1}').sum(axis=1)
Update: using a simple groupby (which is even simpler in this particular case):
def grouper(col):
    c = col.split('.')
    return f'{c[0]}.SUM.{c[-1]}'

df.groupby(grouper, axis=1).sum()
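Note that axis=1 in groupby is deprecated in pandas 2.x; grouping the transpose should give the same result on newer versions:
# same grouping applied to the transposed frame, then transposed back
df.T.groupby(grouper).sum().T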

Possible bug with `xarray.Dataset.groupby()`?

I'm using Xarray version 0.8.0, Python 3.5.1, on Mac OS X El Capitan 10.11.6.
The following code works as expected.
import numpy
import xarray

id_data_array = xarray.DataArray([280, 306, 280], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 280
score (index) float64 0.8358 0.7536 0.9495
======
<xarray.Dataset>
Dimensions: (id: 2)
Coordinates:
* id (id) int64 280 306
Data variables:
score (id) int64 2 1
However, if I change just one little thing, to make the elements of id_data_array all distinct, then there is an error.
Code:
id_data_array = xarray.DataArray([280, 306, 120], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 120
score (index) float64 0.1353 0.0437 0.1687
======
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-92-cc412270ba2e> in <module>()
5 print(score_dataset)
6 print("======")
----> 7 print(score_dataset.groupby("id").count())
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/common.py in wrapped_func(self, dim, keep_attrs, **kwargs)
44 return self.reduce(func, dim, keep_attrs,
45 numeric_only=numeric_only, allow_lazy=True,
---> 46 **kwargs)
47 return wrapped_func
48
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
605 def reduce_dataset(ds):
606 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 607 return self.apply(reduce_dataset)
608
609 def assign(self, **kwargs):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in apply(self, func, **kwargs)
562 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
563 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 564 combined = self._concat(applied)
565 result = self._maybe_restore_empty_groups(combined)
566 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in _concat(self, applied)
570 concat_dim, positions = self._infer_concat_args(applied_example)
571
--> 572 combined = concat(applied, concat_dim)
573 reordered = _maybe_reorder(combined, concat_dim, positions)
574 return reordered
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
114 raise TypeError('can only concatenate xarray Dataset and DataArray '
115 'objects, got %s' % type(first_obj))
--> 116 return f(objs, dim, data_vars, coords, compat, positions)
117
118
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
276 if coord is not None:
277 # add concat dimension last to ensure that its in the final Dataset
--> 278 result[coord.name] = coord
279
280 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in __setitem__(self, key, value)
536 raise NotImplementedError('cannot yet use a dictionary as a key '
537 'to set Dataset values')
--> 538 self.update({key: value})
539
540 def __delitem__(self, key):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in update(self, other, inplace)
1434 dataset.
1435 """
-> 1436 variables, coord_names, dims = dataset_update_method(self, other)
1437
1438 return self._replace_vars_and_dims(variables, coord_names, dims,
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in dataset_update_method(dataset, other)
490 priority_arg = 1
491 indexes = dataset.indexes
--> 492 return merge_core(objs, priority_arg=priority_arg, indexes=indexes)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
371
372 coerced = coerce_pandas_values(objs)
--> 373 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
374 expanded = expand_variable_dicts(aligned)
375
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in deep_align(list_of_variable_maps, join, copy, indexes)
146 out.append(variables)
147
--> 148 aligned = partial_align(*targets, join=join, copy=copy, indexes=indexes)
149
150 for key, aligned_obj in zip(keys, aligned):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in partial_align(*objects, **kwargs)
109 valid_indexers = dict((k, v) for k, v in joined_indexes.items()
110 if k in obj.dims)
--> 111 result.append(obj.reindex(copy=copy, **valid_indexers))
112 return tuple(result)
113
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in reindex(self, indexers, method, tolerance, copy, **kw_indexers)
1216
1217 variables = alignment.reindex_variables(
-> 1218 self.variables, self.indexes, indexers, method, tolerance, copy=copy)
1219 return self._replace_vars_and_dims(variables)
1220
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in reindex_variables(variables, indexes, indexers, method, tolerance, copy)
218 target = utils.safe_cast_to_index(indexers[name])
219 indexer = index.get_indexer(target, method=method,
--> 220 **get_indexer_kwargs)
221
222 to_shape[name] = len(target)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
2080
2081 if not self.is_unique:
-> 2082 raise InvalidIndexError('Reindexing only valid with uniquely'
2083 ' valued Index objects')
2084
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
To me this seems buggy, because if this is the desired behaviour then it would be very strange. Surely groupby should handle the case when all the elements of the DataArray we're grouping by are distinct?
Update
I've now uninstalled and reinstalled Xarray. The new Xarray is version 0.8.1, and it seems to work fine. So it may indeed be a bug in Xarray 0.8.0.
