Missing values in groupby apply function - python

I have a large dataset [time, lat, lon]. I want to regress a variable against time for each point in the grid, but using a for loop takes forever. I found that grouping the DataArray by lat/lon and applying a function cuts the computation time dramatically. So I'm applying a function to a DataArray [time, lat, lon] that has been grouped by lat/lon; my end result should be [lat, lon]. Some grid points don't have any data at all (np.nan). I want the function to return np.nan for those grid points, but instead I'm getting the following error:
SVD did not converge in Linear Least Squares
Here's the code:
hus = ifile.hus

# stack lat and lon into a single dimension called allpoints
stacked = hus.stack(allpoints=['lat', 'lon'])

# define a function to compute a linear trend of a timeseries
def linear_trend(x):
    x = x.dropna(dim='time')
    if len(x) == 0:
        return xr.DataArray(np.nan)
    else:
        time = np.arange(len(x))
        pf = np.polyfit(time, x, 1)
        return xr.DataArray(pf[0])

# apply the function over allpoints to calculate the trend at each point
trend = stacked.groupby('allpoints').apply(linear_trend)
trend_unstacked = trend.unstack('allpoints')
trend_unstacked
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-9-4e885325e9c4> in <module>
13
14 # apply the function over allpoints to calculate the trend at each point
---> 15 trend = t.groupby('allpoints').apply(linear_trend)
16 trend_unstacked = trend.unstack('allpoints')
17
~/anaconda3/lib/python3.7/site-packages/xarray/core/groupby.py in apply(self, func, shortcut, args, **kwargs)
824 stacklevel=2,
825 )
--> 826 return self.map(func, shortcut=shortcut, args=args, **kwargs)
827
828 def _combine(self, applied, restore_coord_dims=False, shortcut=False):
~/anaconda3/lib/python3.7/site-packages/xarray/core/groupby.py in map(self, func, shortcut, args, **kwargs)
809 grouped = self._iter_grouped()
810 applied = (maybe_wrap_array(arr, func(arr, *args, **kwargs)) for arr in grouped)
--> 811 return self._combine(applied, shortcut=shortcut)
812
813 def apply(self, func, shortcut=False, args=(), **kwargs):
~/anaconda3/lib/python3.7/site-packages/xarray/core/groupby.py in _combine(self, applied, restore_coord_dims, shortcut)
833 combined = self._concat_shortcut(applied, dim, positions)
834 else:
--> 835 combined = concat(applied, dim)
836 combined = _maybe_reorder(combined, dim, positions)
837
~/anaconda3/lib/python3.7/site-packages/xarray/core/concat.py in concat(objs, dim, data_vars, coords, compat, positions, fill_value, join)
133 "objects, got %s" % type(first_obj)
134 )
--> 135 return f(objs, dim, data_vars, coords, compat, positions, fill_value, join)
136
137
~/anaconda3/lib/python3.7/site-packages/xarray/core/concat.py in _dataarray_concat(arrays, dim, data_vars, coords, compat, positions, fill_value, join)
427 join="outer",
428 ):
--> 429 arrays = list(arrays)
430
431 if data_vars != "all":
~/anaconda3/lib/python3.7/site-packages/xarray/core/groupby.py in <genexpr>(.0)
808 else:
809 grouped = self._iter_grouped()
--> 810 applied = (maybe_wrap_array(arr, func(arr, *args, **kwargs)) for arr in grouped)
811 return self._combine(applied, shortcut=shortcut)
812
<ipython-input-9-4e885325e9c4> in linear_trend(x)
8 else:
9 time = np.arange(len(y))
---> 10 pf = np.polyfit(time, x, 1)
11 # we need to return a dataarray or else xarray's groupby won't be happy
12 return xr.DataArray(pf[0])
<__array_function__ internals> in polyfit(*args, **kwargs)
~/anaconda3/lib/python3.7/site-packages/numpy/lib/polynomial.py in polyfit(x, y, deg, rcond, full, w, cov)
603 raise TypeError("expected 1D or 2D array for y")
604 if x.shape[0] != y.shape[0]:
--> 605 raise TypeError("expected x and y to have same length")
606
607 # set rcond
TypeError: expected x and y to have same length
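Judging from the traceback, the version of linear_trend that actually ran built the time axis from a different variable (time = np.arange(len(y))) than the series passed to np.polyfit(time, x, 1), so the two arguments have different lengths once dropna has removed time steps. A minimal sketch of a version that avoids both failure modes, assuming the stacked DataArray from above (the guard is widened to require at least two valid samples, since a degree-1 fit is undefined for fewer):
import numpy as np
import xarray as xr

def linear_trend(x):
    # drop missing time steps for this grid point
    x = x.dropna(dim='time')
    # fewer than 2 valid samples: no slope can be fit, return NaN
    if len(x) < 2:
        return xr.DataArray(np.nan)
    time = np.arange(len(x))           # same length as the cleaned series
    slope = np.polyfit(time, x, 1)[0]  # [0] is the slope of the degree-1 fit
    return xr.DataArray(slope)

trend = stacked.groupby('allpoints').apply(linear_trend)
trend_unstacked = trend.unstack('allpoints')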

Related

Healpy mollview() ValueError for colormap

I need to learn how to use Healpy, so I was trying to reproduce the results of the basic tutorial. I use Anaconda on Ubuntu 22.04 and I believe I have all the prerequisites (Python 3.9.13, Numpy, Matplotlib, Astropy, python3-dev and python-dev-is-python3 installed).
I have tried many variations of what is shown in the tutorial notebook, including a literal copy and paste of the code. I've tried running it in IPython in a terminal, in a Jupyter notebook, and in Spyder, each both with and without %matplotlib inline (after importing matplotlib), and in every case I end up with the exact same error message (full error message at the end of the post):
ValueError: Passing a Normalize instance simultaneously with vmin/vmax
is not supported. Please pass vmin/vmax directly to the norm when
creating it.
Everything works except for the plot. I've tried setting min and max in the hp.mollview() command according to the documentation, but that didn't work either. It looks like a bug to me, so I thought about opening an issue on GitHub, but the tutorial is kept very up to date and I don't think this kind of bug would go unnoticed, so I suspect I missed some minor detail and I hope someone here can help me identify it. In the meantime, I'll probably try some other version of Healpix.
Here is the full error message when I run the code in a Jupyter notebook (by the way, sorry if my question is not very well organized; this is my first post):
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [5], line 2
      1 m = np.arange(NPIX)
----> 2 hp.mollview(m, title="Mollview image RING")
      3 hp.graticule()

File ~/anaconda3/lib/python3.9/site-packages/healpy/visufunc.py:250, in mollview(map, fig, rot, coord, unit, xsize, title, nest, min, max, flip, remove_dip, remove_mono, gal_cut, format, format2, cbar, cmap, badcolor, bgcolor, notext, norm, hold, reuse_axes, margins, sub, nlocs, return_projected_map)
    246 elif remove_mono:
    247     map = pixelfunc.remove_monopole(
    248         map, gal_cut=gal_cut, nest=nest, copy=True, verbose=True
    249     )
--> 250 img = ax.projmap(
    251     map,
    252     nest=nest,
    253     xsize=xsize,
    254     coord=coord,
    255     vmin=min,
    256     vmax=max,
    257     cmap=cmap,
    258     badcolor=badcolor,
    259     bgcolor=bgcolor,
    260     norm=norm,
    261 )
    262 if cbar:
    263     im = ax.get_images()[0]

File ~/anaconda3/lib/python3.9/site-packages/healpy/projaxes.py:736, in HpxMollweideAxes.projmap(self, map, nest, **kwds)
    734 nside = pixelfunc.npix2nside(pixelfunc.get_map_size(map))
    735 f = lambda x, y, z: pixelfunc.vec2pix(nside, x, y, z, nest=nest)
--> 736 return super(HpxMollweideAxes, self).projmap(map, f, **kwds)

File ~/anaconda3/lib/python3.9/site-packages/healpy/projaxes.py:726, in MollweideAxes.projmap(self, map, vec2pix_func, xsize, **kwds)
    724 def projmap(self, map, vec2pix_func, xsize=800, **kwds):
    725     self.proj.set_proj_plane_info(xsize=xsize)
--> 726     img = super(MollweideAxes, self).projmap(map, vec2pix_func, **kwds)
    727     self.set_xlim(-2.01, 2.01)
    728     self.set_ylim(-1.01, 1.01)

File ~/anaconda3/lib/python3.9/site-packages/healpy/projaxes.py:202, in SphericalProjAxes.projmap(self, map, vec2pix_func, vmin, vmax, badval, badcolor, bgcolor, cmap, norm, rot, coord, **kwds)
    200 ext = self.proj.get_extent()
    201 img = np.ma.masked_values(img, badval)
--> 202 aximg = self.imshow(
    203     img,
    204     extent=ext,
    205     cmap=cm,
    206     norm=nn,
    207     interpolation="nearest",
    208     origin="lower",
    209     vmin=vmin,
    210     vmax=vmax,
    211     **kwds
    212 )
    213 xmin, xmax, ymin, ymax = self.proj.get_extent()
    214 self.set_xlim(xmin, xmax)

File ~/anaconda3/lib/python3.9/site-packages/matplotlib/_api/deprecation.py:454, in make_keyword_only.<locals>.wrapper(*args, **kwargs)
    448 if len(args) > name_idx:
    449     warn_deprecated(
    450         since, message="Passing the %(name)s %(obj_type)s "
    451         "positionally is deprecated since Matplotlib %(since)s; the "
    452         "parameter will become keyword-only %(removal)s.",
    453         name=name, obj_type=f"parameter of {func.__name__}()")
--> 454 return func(*args, **kwargs)

File ~/anaconda3/lib/python3.9/site-packages/matplotlib/__init__.py:1423, in _preprocess_data.<locals>.inner(ax, data, *args, **kwargs)
   1420 @functools.wraps(func)
   1421 def inner(ax, *args, data=None, **kwargs):
   1422     if data is None:
-> 1423         return func(ax, *map(sanitize_sequence, args), **kwargs)
   1425     bound = new_sig.bind(ax, *args, **kwargs)
   1426     auto_label = (bound.arguments.get(label_namer)
   1427                   or bound.kwargs.get(label_namer))

File ~/anaconda3/lib/python3.9/site-packages/matplotlib/axes/_axes.py:5577, in Axes.imshow(self, X, cmap, norm, aspect, interpolation, alpha, vmin, vmax, origin, extent, interpolation_stage, filternorm, filterrad, resample, url, **kwargs)
   5574 if im.get_clip_path() is None:
   5575     # image does not already have clipping set, clip to axes patch
   5576     im.set_clip_path(self.patch)
-> 5577 im._scale_norm(norm, vmin, vmax)
   5578 im.set_url(url)
   5580 # update ax.dataLim, and, if autoscaling, set viewLim
   5581 # to tightly fit the image, regardless of dataLim.

File ~/anaconda3/lib/python3.9/site-packages/matplotlib/cm.py:405, in ScalarMappable._scale_norm(self, norm, vmin, vmax)
    403     self.set_clim(vmin, vmax)
    404 if isinstance(norm, colors.Normalize):
--> 405     raise ValueError(
    406         "Passing a Normalize instance simultaneously with "
    407         "vmin/vmax is not supported. Please pass vmin/vmax "
    408         "directly to the norm when creating it.")
    410 # always resolve the autoscaling so we have concrete limits
    411 # rather than deferring to draw time.
    412 self.autoscale_None()

ValueError: Passing a Normalize instance simultaneously with vmin/vmax
is not supported. Please pass vmin/vmax directly to the norm when
creating it.
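This usually isn't a mistake in the tutorial code but a version mismatch: older healpy releases pass both a Normalize instance and vmin/vmax through to Matplotlib's imshow, which recent Matplotlib rejects with exactly this ValueError. A hedged check-and-remedy sketch (the exact version boundaries are an assumption; verify against your environment):
import healpy as hp
import matplotlib

# If healpy is old relative to matplotlib, upgrading healpy is the usual fix:
#     pip install --upgrade healpy
# or, as a stopgap, downgrade matplotlib below the release that raises:
#     pip install "matplotlib<3.5"
print(hp.__version__, matplotlib.__version__)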

Constrained Optimization Problem in Scipy: boolean index did not match indexed array along dimension 0

I have a constrained optimization problem that I want to solve using the scipy.optimize package.
from scipy import optimize as opt
import numpy as np

def f(x):
    return (x[0] - 5)**2 + (x[1] - 6)**2

# Bounds and linear constraints
bounds = opt.Bounds([0, 0], [np.inf, np.inf])
lin_const = opt.LinearConstraint([[1, 2], [0, 0]], [-np.inf, 0], [4, 0])

# Nonlinear constraints, Jacobian and Hessian
def cons_f(x):
    return [x[0]**2 - 4, np.exp(-x[0]) - 1]

def cons_J(x):
    return [[2*x[0], 0], [-np.exp(-x[0])]]

def cons_H(x, v):
    return v[0]*np.array([[2, 0], [0, 0]]) + v[1]*np.array([[np.exp(-x[0]), 0], [0, 0]])

nonlin_const = opt.NonlinearConstraint(cons_f, -np.inf, 1, jac=cons_J, hess=cons_H)

# Solving the optimization problem
x0 = np.array([0.50, 0.75])
res = opt.minimize(f, x0, method='trust-constr', jac="2-point", hess=opt.SR1(),
                   bounds=bounds, constraints=[lin_const, nonlin_const],
                   options={'verbose': 1})
print(res.x)
I followed the SciPy docs closely, but I get the typical NumPy boolean index did not match indexed array along dimension 0; dimension is 1 but corresponding boolean dimension is 2 error message. What am I missing? Thanks for your help!
Here is the full Error Message:
IndexError Traceback (most recent call last)
Input In [32], in <cell line: 2>()
1 x0 = np.array([0.50, 0.75])
----> 2 res = opt.minimize(f, x0, method='trust-constr',jac=cons_J, hess=opt.SR1(), bounds=bounds, constraints=[lin_const, nonlin_const], options={'verbose': 1})
3 print(res.x)
File ~\anaconda3\envs\choquetclassifier\lib\site-packages\scipy\optimize\_minimize.py:634, in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
631 return _minimize_slsqp(fun, x0, args, jac, bounds,
632 constraints, callback=callback, **options)
633 elif meth == 'trust-constr':
--> 634 return _minimize_trustregion_constr(fun, x0, args, jac, hess, hessp,
635 bounds, constraints,
636 callback=callback, **options)
637 elif meth == 'dogleg':
638 return _minimize_dogleg(fun, x0, args, jac, hess,
639 callback=callback, **options)
File ~\anaconda3\envs\choquetclassifier\lib\site-packages\scipy\optimize\_trustregion_constr\minimize_trustregion_constr.py:361, in _minimize_trustregion_constr(fun, x0, args, grad, hess, hessp, bounds, constraints, xtol, gtol, barrier_tol, sparse_jacobian, callback, maxiter, verbose, finite_diff_rel_step, initial_constr_penalty, initial_tr_radius, initial_barrier_parameter, initial_barrier_tolerance, factorization_method, disp)
357 prepared_constraints.append(PreparedConstraint(bounds, x0,
358 sparse_jacobian))
360 # Concatenate initial constraints to the canonical form.
--> 361 c_eq0, c_ineq0, J_eq0, J_ineq0 = initial_constraints_as_canonical(
362 n_vars, prepared_constraints, sparse_jacobian)
364 # Prepare all canonical constraints and concatenate it into one.
365 canonical_all = [CanonicalConstraint.from_PreparedConstraint(c)
366 for c in prepared_constraints]
File ~\anaconda3\envs\choquetclassifier\lib\site-packages\scipy\optimize\_trustregion_constr\canonical_constraint.py:352, in initial_constraints_as_canonical(n, prepared_constraints, sparse_jacobian)
350 finite_ub = ub < np.inf
351 c_ineq.append(f[finite_ub] - ub[finite_ub])
--> 352 J_ineq.append(J[finite_ub])
353 elif np.all(ub == np.inf):
354 finite_lb = lb > -np.inf
IndexError: boolean index did not match indexed array along dimension 0; dimension is 1 but corresponding boolean dimension is 2
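One likely culprit (an observation from the listing, not a verified fix): the second row returned by cons_J has only one element, so the Jacobian is ragged; when SciPy converts it to an array and masks its rows by the finite upper bounds, the shapes no longer line up, which matches the "boolean dimension is 2" complaint. The constraint function has two components and there are two variables, so the Jacobian should be a full 2x2 array with an explicit zero for the missing partial derivative:
def cons_J(x):
    # d/dx of [x0**2 - 4, exp(-x0) - 1]; neither component depends on x1,
    # so the second column is explicitly zero instead of being omitted
    return [[2*x[0], 0],
            [-np.exp(-x[0]), 0]]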

ValueError: RGBA values should be within 0-1 range when plotting scatter plot

I am attempting to generate a scatter plot to show data before and after the PCA transform, similar to this tutorial.
To do this, I am running the following code:
fig, axes = plt.subplots(1,2)
axes[0].scatter(X.iloc[:,0], X.iloc[:,1], c=y)
axes[0].set_xlabel('x1')
axes[0].set_ylabel('x2')
axes[0].set_title('Before PCA')
axes[1].scatter(X_new[:,0], X_new[:,1], c=y)
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
axes[1].set_title('After PCA')
plt.show()
Which is causing this error to appear:
ValueError: RGBA values should be within 0-1 range
X is the preprocessed matrix of features, containing 196 samples and 59 features, while y is the dependent variable and contains two classes [0, 1].
Here is the full error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-109-2c4f74ddce3f> in <module>
1 fig, axes = plt.subplots(1,2)
----> 2 axes[0].scatter(X.iloc[:,0], X.iloc[:,1], c=y)
3 axes[0].set_xlabel('x1')
4 axes[0].set_ylabel('x2')
5 axes[0].set_title('Before PCA')
~/anaconda3/lib/python3.7/site-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs)
1597 def inner(ax, *args, data=None, **kwargs):
1598 if data is None:
-> 1599 return func(ax, *map(sanitize_sequence, args), **kwargs)
1600
1601 bound = new_sig.bind(ax, *args, **kwargs)
~/anaconda3/lib/python3.7/site-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnonfinite, **kwargs)
4495 offsets=offsets,
4496 transOffset=kwargs.pop('transform', self.transData),
-> 4497 alpha=alpha
4498 )
4499 collection.set_transform(mtransforms.IdentityTransform())
~/anaconda3/lib/python3.7/site-packages/matplotlib/collections.py in __init__(self, paths, sizes, **kwargs)
881 """
882
--> 883 Collection.__init__(self, **kwargs)
884 self.set_paths(paths)
885 self.set_sizes(sizes)
~/anaconda3/lib/python3.7/site-packages/matplotlib/collections.py in __init__(self, edgecolors, facecolors, linewidths, linestyles, capstyle, joinstyle, antialiaseds, offsets, transOffset, norm, cmap, pickradius, hatch, urls, offset_position, zorder, **kwargs)
125
126 self._hatch_color = mcolors.to_rgba(mpl.rcParams['hatch.color'])
--> 127 self.set_facecolor(facecolors)
128 self.set_edgecolor(edgecolors)
129 self.set_linewidth(linewidths)
~/anaconda3/lib/python3.7/site-packages/matplotlib/collections.py in set_facecolor(self, c)
676 """
677 self._original_facecolor = c
--> 678 self._set_facecolor(c)
679
680 def get_facecolor(self):
~/anaconda3/lib/python3.7/site-packages/matplotlib/collections.py in _set_facecolor(self, c)
659 except AttributeError:
660 pass
--> 661 self._facecolors = mcolors.to_rgba_array(c, self._alpha)
662 self.stale = True
663
~/anaconda3/lib/python3.7/site-packages/matplotlib/colors.py in to_rgba_array(c, alpha)
277 result[mask] = 0
278 if np.any((result < 0) | (result > 1)):
--> 279 raise ValueError("RGBA values should be within 0-1 range")
280 return result
281 # Handle single values.
ValueError: RGBA values should be within 0-1 range
I am unsure what is causing this error and would appreciate help in figuring this out. Thanks!
The c= parameter of ax.scatter can be given in several ways:
- A scalar or sequence of n numbers to be mapped to colors using cmap and norm: a single number, or a list-like 1D sequence of numbers.
- A 2D array in which the rows are RGB or RGBA, e.g. something like [[1,0,0], [0,0,1]]. All these values need to be between 0 and 1, and there should be either 3 (for RGB) or 4 (for RGBA) values per entry.
- A sequence of colors of length n, e.g. ["red", "#B789C0", "turquoise"].
- A single color format string, e.g. "cornflowerblue".
Now, when an array of numbers is given, matplotlib distinguishes the first case from the second by looking at the array's dimension: if it is 1D, matplotlib assumes the first case; if it is 2D, the second. Note that an Nx1 or a 1xN array also counts as 2D. You can use np.squeeze() to "squeeze out" the dummy second dimension.
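A minimal sketch of that fix for the code in the question, assuming y arrived as an Nx1 column (for example from slicing a DataFrame):
import numpy as np

y = np.asarray(y).squeeze()  # e.g. (196, 1) -> (196,): c= is now mapped through cmap, not read as RGBA rows
axes[0].scatter(X.iloc[:, 0], X.iloc[:, 1], c=y)
axes[1].scatter(X_new[:, 0], X_new[:, 1], c=y)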

Keep getting ValueError: Shape of passed values is (4474, 10), indices imply (14084, 10)

First off, thanks in advance if you can help puzzle this out! I'm trying to balance some customer data for my model. My targets are all 1s and 0s, and the 0s are overwhelmingly abundant, so I created a counter that starts marking 0 rows for deletion once they surpass the number of 1 rows. But at the very end of my code, when I call np.delete to take those extra rows off my dataset, I keep getting this error.
I don't really know what to try, because I don't even understand what the error is telling me.
import pandas as pd
import numpy as np
from sklearn import preprocessing

#%%
# Loading the raw data
raw_csv_data = pd.read_csv('Audiobooks-data_raw.csv')
print(display(raw_csv_data.head(20)))

#%%
df = raw_csv_data.copy()
print(display(df.head(20)))

#%%
print(df.info())

#%%
# Separate the targets from the dataset
inputs_all = df.loc[:, 'Book length (mins)_overall':'Last visited minus Purchase date']
targets_all = df['Targets']
print(display(inputs_all.head()))
print(display(targets_all.head()))

#%%
# Shuffling the data to prep for balancing
shuffled_indices = np.arange(inputs_all.shape[0])
np.random.shuffle(shuffled_indices)
shuffled_inputs = inputs_all.iloc[shuffled_indices]
shuffled_targets = targets_all[shuffled_indices]

#%%
# Balance the dataset: there are significantly more 0's than 1's in our target,
# and we want a good, accurate model
print(inputs_all.shape)
print(targets_all.shape)

#%%
num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []
print(num_one_targets)

#%%
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

#%%
inputs_all_balanced = np.delete(inputs_all, indices_to_remove, axis=0)
targets_all_balanced = np.delete(targets_all, indices_to_remove, axis=0)
Everything works except when I try to group my balanced datasets and delete the excess 0 rows. Here is the error:
ValueError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1652
-> 1653 mgr = BlockManager(blocks, axes)
1654 mgr._consolidate_inplace()
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
113 if do_integrity_check:
--> 114 self._verify_integrity()
115
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
310 if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
--> 311 construction_error(tot_items, block.shape[1:], self.axes)
312 if len(self.items) != tot_items:
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in construction_error(tot_items, block_shape, axes, e)
1690 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 1691 passed, implied))
1692
ValueError: Shape of passed values is (4474, 10), indices imply (14084, 10)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
in <module>
----> 1 inputs_all_balanced= np.delete(inputs_all, indices_to_remove, axis=0)
2 targets_all_balanced= np.delete(targets_all, indices_to_remove, axis=0)
~\Anaconda3\lib\site-packages\numpy\lib\function_base.py in delete(arr, obj, axis)
4419
4420 if wrap:
-> 4421 return wrap(new)
4422 else:
4423 return new
~\Anaconda3\lib\site-packages\pandas\core\generic.py in __array_wrap__(self, result, context)
1907 def __array_wrap__(self, result, context=None):
1908 d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
-> 1909 return self._constructor(result, **d).__finalize__(self)
1910
1911 # ideally we would define this to avoid the getattr checks, but
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
422 else:
423 mgr = init_ndarray(data, index, columns, dtype=dtype,
--> 424 copy=copy)
425
426 # For data is list-like, or Iterable (will consume into list)
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy)
165 values = maybe_infer_to_datetimelike(values)
166
--> 167 return create_block_manager_from_blocks([values], [columns, index])
168
169
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1658 blocks = [getattr(b, 'values', b) for b in blocks]
1659 tot_items = sum(b.shape[0] for b in blocks)
-> 1660 construction_error(tot_items, blocks[0].shape[1:], axes, e)
1661
1662
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in construction_error(tot_items, block_shape, axes, e)
1689 raise ValueError("Empty data passed with indices specified.")
1690 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 1691 passed, implied))
1692
1693
ValueError: Shape of passed values is (4474, 10), indices imply (14084, 10)
np.delete converts the DataFrame to a plain NumPy array, deletes the rows, and then pandas tries to re-wrap the (4474, 10) result with the original 14084-row index, which is exactly the shape mismatch the error reports. Remove the rows with pandas drop instead:
inputs_all_balanced = inputs_all.drop(indices_to_remove, axis=0)
targets_all_balanced = targets_all.drop(indices_to_remove, axis=0)
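An equivalent positional approach, sketched under the assumption that inputs_all still carries the default RangeIndex from read_csv (so the collected positions coincide with the index labels):
import numpy as np

keep = np.ones(len(inputs_all), dtype=bool)
keep[indices_to_remove] = False      # drop the surplus 0-target rows
inputs_all_balanced = inputs_all.iloc[keep]
targets_all_balanced = targets_all.iloc[keep]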

Why is statsmodels throwing an IndexError when I try to fit a linear mixed-effects model?

Given the code:
import statsmodels.api as sm
import statsmodels.formula.api as smf
df.reset_index(drop=True, inplace=True)
display(df.describe())
md = smf.mixedlm("c ~ iscorr", df, groups=df.subnum)
mdf = md.fit()
Where df is a pandas.DataFrame, I get the following error out of smf.mixedlm:
IndexError Traceback (most recent call last)
<ipython-input-34-5373fe9b774a> in <module>()
4 df.reset_index(drop=True, inplace=True)
5 display(df.describe())
----> 6 md = smf.mixedlm("c ~ iscorr", df, groups=df.subnum)
7 # mdf = md.fit()
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in from_formula(cls, formula, data, re_formula, subset, *args, **kwargs)
651 subset=None,
652 exog_re=exog_re,
--> 653 *args, **kwargs)
654
655 # expand re names to account for pairs of RE
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/base/model.py in from_formula(cls, formula, data, subset, *args, **kwargs)
148 kwargs.update({'missing_idx': missing_idx,
149 'missing': missing})
--> 150 mod = cls(endog, exog, *args, **kwargs)
151 mod.formula = formula
152
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in __init__(self, endog, exog, groups, exog_re, use_sqrt, missing, **kwargs)
537
538 # Split the data by groups
--> 539 self.endog_li = self.group_list(self.endog)
540 self.exog_li = self.group_list(self.exog)
541 self.exog_re_li = self.group_list(self.exog_re)
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in group_list(self, array)
671 if array.ndim == 1:
672 return [np.array(array[self.row_indices[k]])
--> 673 for k in self.group_labels]
674 else:
675 return [np.array(array[self.row_indices[k], :])
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in <listcomp>(.0)
671 if array.ndim == 1:
672 return [np.array(array[self.row_indices[k]])
--> 673 for k in self.group_labels]
674 else:
675 return [np.array(array[self.row_indices[k], :])
IndexError: index 7214 is out of bounds for axis 1 with size 7214
Why is this error occurring? len(df) reports that there are 7296 rows, so there should be no issue indexing the 7214th, and the explicit re-indexing ensures that the indices span from zero to 7295.
You may download df here to fiddle around with it if you'd like.
You have 82 null values in iscorr:
>>> df.iscorr.isnull().sum()
82
Drop them and you will be fine:
df = df[df.iscorr.notnull()]
Per the function's docstring:
Notes
-----
`data` must define __getitem__ with the keys in the formula
terms args and kwargs are passed on to the model
instantiation. E.g., a numpy structured or rec array, a
dictionary, or a pandas DataFrame.

If `re_formula` is not provided, the default is a random
intercept for each group.

This method currently does not correctly handle missing
values, so missing values should be explicitly dropped from
the DataFrame before calling this method.
Output:
>>> mdf.params
Intercept 0.032000
iscorr[T.True] 0.030670
Intercept RE -0.057462
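Putting it together, a minimal sketch of the corrected flow (column names taken from the question; the summary call is just illustrative):
import statsmodels.formula.api as smf

df = df[df.iscorr.notnull()]   # drop the 82 rows with missing iscorr
md = smf.mixedlm("c ~ iscorr", df, groups=df["subnum"])
mdf = md.fit()
print(mdf.summary())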
