Reduce xarray.Dataset by custom function - python

I want to use xarray functionality to reduce a dataset by a custom/external function across a named dimension.
First, create a dataset to demonstrate the problem:
import xarray as xr
import numpy as np
import pandas as pd
time = pd.date_range("2000-01-01", "2001-01-01", freq="D")
sids = np.arange(4)
obs = np.random.random(size=(len(time), len(sids)))
sim = np.random.random(size=(len(time), len(sids)))
original = xr.Dataset(
    {"obs": (("time", "station_id"), obs), "sim": (("time", "station_id"), sim)},
    coords={"time": time, "station_id": sids},
)
I want to calculate the mean_squared_error of the two variables in original, collapsing the "time" dimension. The result should be an xr.Dataset like the following:
<xarray.Dataset>
Dimensions:             (station_id: 4)
Coordinates:
  * station_id          (station_id) int64 0 1 2 3
Data variables:
    mean_squared_error  (station_id) float64 0.4411 0.183 0.06754 0.9662
I have tried using the reduce method:
from sklearn.metrics import mean_squared_error
original.reduce(mean_squared_error, dim="time")
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-243-51111f05437b> in <module>
----> 1 original.reduce(mean_squared_error, dim="time")
~/miniconda3/envs/ml/lib/python3.8/site-packages/xarray/core/dataset.py in reduce(self, func, dim, keep_attrs, keepdims, numeric_only, **kwargs)
4915 # the former is often more efficient
4916 reduce_dims = None # type: ignore[assignment]
-> 4917 variables[name] = var.reduce(
4918 func,
4919 dim=reduce_dims,
~/miniconda3/envs/ml/lib/python3.8/site-packages/xarray/core/variable.py in reduce(self, func, dim, axis, keep_attrs, keepdims, **kwargs)
1721 )
1722 if axis is not None:
-> 1723 data = func(self.data, axis=axis, **kwargs)
1724 else:
1725 data = func(self.data, **kwargs)
~/miniconda3/envs/ml/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
TypeError: mean_squared_error() got an unexpected keyword argument 'axis'

There is a package called xskillscore, which has a function to calculate the MSE:
pip install xskillscore
import xskillscore
xskillscore.mse(original.obs, original.sim, 'time')

I believe this would work (note that the np.sqrt makes it the root mean squared error rather than the MSE):
np.sqrt(np.square(original["sim"] - original["obs"]).mean(dim="time"))
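For the plain MSE, named as in the desired output, a minimal sketch building on the same expression (assumes the original dataset defined above):
# omit np.sqrt to get the MSE rather than the RMSE
mse = np.square(original["sim"] - original["obs"]).mean(dim="time")
mse_ds = mse.rename("mean_squared_error").to_dataset()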

One solution avoids xarray's internal functions and instead loops over every value of the station_id dimension.
from collections import defaultdict

# calculate error metric
out = defaultdict(list)
for sid in original.station_id.values:
    data = original.sel(station_id=sid)
    orig_err = np.sqrt(mean_squared_error(data["obs"], data["sim"]))
    out["original"].append(orig_err)
    out["station_id"].append(sid)

rmse = pd.DataFrame(out).set_index("station_id").to_xarray()
This gives you the result, but it does not use xarray's internal broadcasting, so it would struggle with larger datasets.
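A vectorized alternative, not part of the original answers, is to wrap the metric in xr.apply_ufunc so xarray handles the broadcasting and keeps the station_id coordinate. A minimal sketch, assuming the original dataset defined above; the lambda reimplements MSE with plain numpy because apply_ufunc hands the function raw arrays:
# input_core_dims moves "time" to the last axis of each input; the function
# reduces over that axis, so the result keeps only the station_id dimension
mse = xr.apply_ufunc(
    lambda obs, sim: ((obs - sim) ** 2).mean(axis=-1),
    original["obs"],
    original["sim"],
    input_core_dims=[["time"], ["time"]],
)
result = mse.rename("mean_squared_error").to_dataset()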

Related

Python: why do I get an error when I try to interpolate an xarray between dates?

I am trying to interpolate the values of an xarray Dataset called pop. I am using xarray's interp method:
dates = pd.date_range('1990-01-01', '2020-01-01', freq='1Y')
popI = pop.interp(time=dates, kwargs={"fill_value": "extrapolate"})
but I get the following error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-75-1393bc257da7> in <module>
----> 1 popI = pop.interp(time=dates, kwargs={"fill_value": "extrapolate"})
/usr/lib/python3/dist-packages/xarray/core/dataset.py in interp(self, coords, method, assume_sorted, kwargs, method_non_numeric, **coords_kwargs)
3163 if method in ["linear", "nearest"]:
3164 for k, v in validated_indexers.items():
-> 3165 obj, newidx = missing._localize(obj, {k: v})
3166 validated_indexers[k] = newidx[k]
3167
/usr/lib/python3/dist-packages/xarray/core/missing.py in _localize(var, indexes_coords)
561 indexes = {}
562 for dim, [x, new_x] in indexes_coords.items():
--> 563 minval = np.nanmin(new_x.values)
564 maxval = np.nanmax(new_x.values)
565 index = x.to_index()
<__array_function__ internals> in nanmin(*args, **kwargs)
/usr/lib/python3/dist-packages/numpy/lib/nanfunctions.py in nanmin(a, axis, out, keepdims)
319 # which do not implement isnan (gh-9009), or fmin correctly (gh-8975)
320 res = np.fmin.reduce(a, axis=axis, out=out, **kwargs)
--> 321 if np.isnan(res).any():
322 warnings.warn("All-NaN slice encountered", RuntimeWarning,
323 stacklevel=3)
TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
You're calling interp on a Dataset, which applies the interpolation to every data variable. One of your data variables, mollewide, is a string array and can't be interpolated. You can either set it as a coordinate:
popI = pop.set_coords('mollewide').interp(time=dates, kwargs={"fill_value": "extrapolate"})
or operate only on the popDensity data variable:
popI = pop["popDensity"].interp(time=dates, kwargs={"fill_value": "extrapolate"})

Cannot use print function and mean function from numpy

I am taking a Udacity course called Intro to Data Analysis and I am trying to run this code, but I keep getting an error. I am using Python 3. In the tutorial videos explaining the code, everything worked fine (I assume because a different version of Python was used). I have tried many things but still can't make it work. Thanks in advance.
%pylab inline
import matplotlib.pyplot as plt
import numpy as np
def describe_data(data):
    print('Mean:', np.mean(data))
    print('Standard deviation:', np.std(data))
    print('Minimum:', np.min(data))
    print('Maximum:', np.max(data))
    plt.hist(data)

describe_data(total_minutes_by_account.values())
This is the error:
Populating the interactive namespace from numpy and matplotlib
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-34-669ffc75246c> in <module>
14 plt.hist(data)
15
---> 16 describe_data(total_minutes_by_account.values())
<ipython-input-34-669ffc75246c> in describe_data(data)
8 # Summarize the given data
9 def describe_data(data):
---> 10 print ('Mean:', np.mean(data))
11 print ('Standard deviation:', np.std(data))
12 print ('Minimum:', np.min(data))
<__array_function__ internals> in mean(*args, **kwargs)
~\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py in mean(a, axis, dtype, out, keepdims, where)
3417 return mean(axis=axis, dtype=dtype, out=out, **kwargs)
3418
-> 3419 return _methods._mean(a, axis=axis, dtype=dtype,
3420 out=out, **kwargs)
3421
~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims, where)
188 ret = ret.dtype.type(ret / rcount)
189 else:
--> 190 ret = ret / rcount
191
192 return ret
TypeError: unsupported operand type(s) for /: 'dict_values' and 'int'
I suppose total_minutes_by_account is a DataFrame. If so, you can do it the following way:
import matplotlib.pyplot as plt
import numpy as np
def describe_data(data):
    print('Mean:', np.mean(data))
    print('Standard deviation:', np.std(data))
    print('Minimum:', np.min(data))
    print('Maximum:', np.max(data))
    plt.hist(data)

describe_data(total_minutes_by_account.values.tolist())
You need to convert your DataFrame values to a list before performing any numpy operations.
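Note that the dict_values in the traceback suggests total_minutes_by_account may actually be a plain dict rather than a DataFrame; in that case the same idea applies. A minimal sketch with hypothetical data:
import numpy as np

total_minutes_by_account = {"a": 10.0, "b": 20.0, "c": 30.0}  # hypothetical values
values = list(total_minutes_by_account.values())  # materialize the dict_values view
print('Mean:', np.mean(values))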

Pandas df.mean() throws TypeError: 'NoneType' object is not callable or returns empty Series

The title says it all.
df = pd.DataFrame({"A":np.array([1,2,3,4]),"B":np.array([1,2,3,4])})
df_mean = df.mean(axis=0)
print(df_mean)
The code above outputs an empty series object:
Series([], dtype: float64)
Using df.mean() on a dataframe filled with MNIST data throws the following stacktrace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-42-bab38039484e> in <module>
2
3 X_train_class, y_train_class, X_valid_class, \
----> 4 y_valid_class, X_test_class, y_test_class = prepare_load_classification_data()
5 X_train_class.mean()
6 # ebm = ExplainableBoostingClassifier()
<ipython-input-37-b1dcfdd01adc> in prepare_load_classification_data()
45 train_features, train_labels, dev_features, \
46 dev_labels, test_features, test_labels = load_data()
---> 47 feature_mean, label_mean = train_features.mean(axis=0), train_labels.mean(axis=0)
48
49 train_features = pd.DataFrame(data=np.where(train_features > feature_mean, 1, 0), columns=FEATURE_NAMES)
c:\users\fmijs\anaconda3\lib\site-packages\pandas\core\generic.py in mean(self, axis, skipna, level, numeric_only, **kwargs)
11107 )
11108 def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
> 11109 return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs)
11110
11111 # pandas\core\generic.py:10924: error: Cannot assign to a method
c:\users\fmijs\anaconda3\lib\site-packages\pandas\core\generic.py in mean(self, axis, skipna, level, numeric_only, **kwargs)
10718 def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
10719 return self._stat_function(
> 10720 "mean", nanops.nanmean, axis, skipna, level, numeric_only, **kwargs
10721 )
10722
c:\users\fmijs\anaconda3\lib\site-packages\pandas\core\generic.py in _stat_function(self, name, func, axis, skipna, level, numeric_only, **kwargs)
10703 return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
10704 return self._reduce(
> 10705 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
10706 )
10707
c:\users\fmijs\anaconda3\lib\site-packages\pandas\core\series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
4150 )
4151 with np.errstate(all="ignore"):
-> 4152 return op(delegate, skipna=skipna, **kwds)
4153
4154 def _reindex_indexer(self, new_index, indexer, copy):
c:\users\fmijs\anaconda3\lib\site-packages\pandas\core\nanops.py in _f(*args, **kwargs)
69 try:
70 with np.errstate(invalid="ignore"):
---> 71 return f(*args, **kwargs)
72 except ValueError as e:
73 # we want to transform an object array
c:\users\fmijs\anaconda3\lib\site-packages\pandas\core\nanops.py in f(values, axis, skipna, **kwds)
122 # TypeError if called
123 kwds.pop("mask", None)
--> 124 result = bn_func(values, axis=axis, **kwds)
125
126 # prefer to treat inf/-inf as NA, but must compute the func
TypeError: 'NoneType' object is not callable
It appears to be related to a somehow corrupted installation of pandas or numpy, but after reinstalling both, downgrading, or starting a new conda environment, the issue still remains. Any help would be greatly appreciated!
I ran it with pandas 1.1.3 and numpy 1.19.2 and it worked.
I also ran it with pandas 1.2.3 and numpy 1.19.5 in Jupyter and it worked.
I updated everything and ran it with pandas 1.2.4 and numpy 1.20.2 and it worked.
So either it is because of numpy or the reason is something different.
Do you really have just this code, or is there other code that might interfere with your snippet?
For me, the issue was caused by importing pandas before importing numpy.
So instead of:
import pandas as pd
import numpy as np
I changed it to:
import numpy as np
import pandas as pd
and that fixed the issue.

Why is statsmodels throwing an IndexError when I try to fit a linear mixed-effects model?

Given the code:
import statsmodels.api as sm
import statsmodels.formula.api as smf
df.reset_index(drop=True, inplace=True)
display(df.describe())
md = smf.mixedlm("c ~ iscorr", df, groups=df.subnum)
mdf = md.fit()
Where df is a pandas.DataFrame, I get the following error out of smf.mixedlm:
IndexError Traceback (most recent call last)
<ipython-input-34-5373fe9b774a> in <module>()
4 df.reset_index(drop=True, inplace=True)
5 display(df.describe())
----> 6 md = smf.mixedlm("c ~ iscorr", df, groups=df.subnum)
7 # mdf = md.fit()
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in from_formula(cls, formula, data, re_formula, subset, *args, **kwargs)
651 subset=None,
652 exog_re=exog_re,
--> 653 *args, **kwargs)
654
655 # expand re names to account for pairs of RE
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/base/model.py in from_formula(cls, formula, data, subset, *args, **kwargs)
148 kwargs.update({'missing_idx': missing_idx,
149 'missing': missing})
--> 150 mod = cls(endog, exog, *args, **kwargs)
151 mod.formula = formula
152
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in __init__(self, endog, exog, groups, exog_re, use_sqrt, missing, **kwargs)
537
538 # Split the data by groups
--> 539 self.endog_li = self.group_list(self.endog)
540 self.exog_li = self.group_list(self.exog)
541 self.exog_re_li = self.group_list(self.exog_re)
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in group_list(self, array)
671 if array.ndim == 1:
672 return [np.array(array[self.row_indices[k]])
--> 673 for k in self.group_labels]
674 else:
675 return [np.array(array[self.row_indices[k], :])
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in <listcomp>(.0)
671 if array.ndim == 1:
672 return [np.array(array[self.row_indices[k]])
--> 673 for k in self.group_labels]
674 else:
675 return [np.array(array[self.row_indices[k], :])
IndexError: index 7214 is out of bounds for axis 1 with size 7214
Why is this error occurring? len(df) reports that there are 7296 rows, so there should be no issue indexing the 7214th, and the explicit re-indexing ensures that the indices span from zero to 7295.
You may download df here to fiddle around with it if you'd like.
You have 82 null values in iscorr:
>>> df.iscorr.isnull().sum()
82
Drop them and you will be fine:
df = df[df.iscorr.notnull()]
Per the function's docstring:
Notes
------
`data` must define __getitem__ with the keys in the formula
terms args and kwargs are passed on to the model
instantiation. E.g., a numpy structured or rec array, a
dictionary, or a pandas DataFrame.
If `re_formula` is not provided, the default is a random
intercept for each group.
This method currently does not correctly handle missing
values, so missing values should be explicitly dropped from
the DataFrame before calling this method.
"""
Output:
>>> mdf.params
Intercept         0.032000
iscorr[T.True]    0.030670
Intercept RE     -0.057462

Can pandas groupby transform a DataFrame into a Series?

I would like to use pandas and statsmodels to fit a linear model on subsets of a dataframe and return the predicted values. However, I am having trouble figuring out the right pandas idiom to use. Here is what I am trying to do:
import pandas as pd
import statsmodels.formula.api as sm
import seaborn as sns
tips = sns.load_dataset("tips")
def fit_predict(df):
    m = sm.ols("tip ~ total_bill", df).fit()
    return pd.Series(m.predict(df), index=df.index)
tips["predicted_tip"] = tips.groupby("day").transform(fit_predict)
This raises the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-139-b3d2575e2def> in <module>()
----> 1 tips["predicted_tip"] = tips.groupby("day").transform(fit_predict)
/Users/mwaskom/anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in transform(self, func, *args, **kwargs)
3033 return self._transform_general(func, *args, **kwargs)
3034 except:
-> 3035 return self._transform_general(func, *args, **kwargs)
3036
3037 # a reduction transform
/Users/mwaskom/anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in _transform_general(self, func, *args, **kwargs)
2988 group.T.values[:] = res
2989 else:
-> 2990 group.values[:] = res
2991
2992 applied.append(group)
ValueError: could not broadcast input array from shape (62) into shape (62,6)
The error makes sense in that I think .transform wants to map a DataFrame to a DataFrame. But is there a way to do a groupby operation on a DataFrame, pass each chunk into a function that reduces it to a Series (with the same index), and then combine the resulting Series into something that can be inserted into the original dataframe?
The top part here is the same; I'm just using a toy dataset because I'm behind a firewall.
tips = pd.DataFrame({'day': list('MMMFFF'),
                     'tip': range(6),
                     'total_bill': [10, 40, 20, 80, 50, 40]})

def fit_predict(df):
    m = sm.ols("tip ~ total_bill", df).fit()
    return pd.Series(m.predict(df), index=df.index)
If you change 'transform' to 'apply', you'll get:
tips.groupby("day").apply(fit_predict)
day
F    3    2.923077
     4    4.307692
     5    4.769231
M    0    0.714286
     1    1.357143
     2    0.928571
That's not quite what you want, but if you drop level=0, you can proceed as desired:
tips['predicted'] = tips.groupby("day").apply(fit_predict).reset_index(level=0,drop=True)
  day  tip  total_bill  predicted
0   M    0          10   0.714286
1   M    1          40   1.357143
2   M    2          20   0.928571
3   F    3          80   2.923077
4   F    4          50   4.307692
5   F    5          40   4.769231
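An alternative sketch, not from the original answer: pass group_keys=False to groupby so that apply returns a Series indexed like the original frame, which avoids the reset_index step.
tips["predicted"] = tips.groupby("day", group_keys=False).apply(fit_predict)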
