MixedLM usage for model fit - python

I want to fit my sample data with a mixed panel regression using MixedLM. However, when I call sm.MixedLM.from_formula("TOTCOST~YEAR",dat,groups=dat["NI"]) I get the following error:
ValueError Traceback (most recent call last)
<ipython-input-40-38fa54e8b448> in <module>()
----> 1 sm.MixedLM.from_formula("TOTCOST~YEAR",dat,groups=dat["NI"])
C:\Users\Harshita Jaiswal\Anaconda2\lib\site-packages\statsmodels\regression\mixed_linear_model.pyc in from_formula(cls, formula, data, re_formula, subset, *args, **kwargs)
651 subset=None,
652 exog_re=exog_re,
--> 653 *args, **kwargs)
654
655 # expand re names to account for pairs of RE
C:\Users\Harshita Jaiswal\Anaconda2\lib\site-packages\statsmodels\base\model.pyc in from_formula(cls, formula, data, subset, *args, **kwargs)
145 (endog, exog), missing_idx = handle_formula_data(data, None, formula,
146 depth=eval_env,
--> 147 missing=missing)
148 kwargs.update({'missing_idx': missing_idx,
149 'missing': missing})
C:\Users\Harshita Jaiswal\Anaconda2\lib\site-packages\statsmodels\formula\formulatools.pyc in handle_formula_data(Y, X, formula, depth, missing)
63 if data_util._is_using_pandas(Y, None):
64 result = dmatrices(formula, Y, depth, return_type='dataframe',
---> 65 NA_action=na_action)
66 else:
67 result = dmatrices(formula, Y, depth, return_type='dataframe',
C:\Users\Harshita Jaiswal\Anaconda2\lib\site-packages\patsy\highlevel.pyc in dmatrices(formula_like, data, eval_env, NA_action, return_type)
295 eval_env = EvalEnvironment.capture(eval_env, reference=1)
296 (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 297 NA_action, return_type)
298 if lhs.shape[1] == 0:
299 raise PatsyError("model is missing required outcome variables")
C:\Users\Harshita Jaiswal\Anaconda2\lib\site-packages\patsy\highlevel.pyc in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
150 return iter([data])
151 design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 152 NA_action)
153 if design_infos is not None:
154 return build_design_matrices(design_infos, data,
C:\Users\Harshita Jaiswal\Anaconda2\lib\site-packages\patsy\highlevel.pyc in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
55 data_iter_maker,
56 eval_env,
---> 57 NA_action)
58 else:
59 return None
C:\Users\Harshita Jaiswal\Anaconda2\lib\site-packages\patsy\build.pyc in design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action)
704 factor_states[factor],
705 num_columns=num_column_counts[factor],
--> 706 categories=None)
707 else:
708 assert factor in cat_levels_contrasts
C:\Users\Harshita Jaiswal\Anaconda2\lib\site-packages\patsy\design_info.pyc in __init__(self, factor, type, state, num_columns, categories)
86 if self.type == "numerical":
87 if not isinstance(num_columns, int):
---> 88 raise ValueError("For numerical factors, num_columns "
89 "must be an int")
90 if categories is not None:
ValueError: For numerical factors, num_columns must be an int
Any help will be appreciated. Thank you
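For comparison, here is a minimal sketch of the standard call (not from the thread), assuming dat is a pandas DataFrame with numeric TOTCOST and YEAR columns and an NI grouping column as in the question; the CSV file name is a placeholder, and passing groups by column name instead of as a Series is simply an alternative worth trying, not a confirmed fix:

import pandas as pd
import statsmodels.api as sm

# dat is assumed to be a pandas DataFrame; the file name is a placeholder
dat = pd.read_csv("panel_data.csv")

# make sure both model variables are plain numeric columns
dat["TOTCOST"] = pd.to_numeric(dat["TOTCOST"])
dat["YEAR"] = pd.to_numeric(dat["YEAR"])

# groups can also be given as the name of a column in dat
model = sm.MixedLM.from_formula("TOTCOST ~ YEAR", data=dat, groups="NI")
result = model.fit()
print(result.summary())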

Related

ValueError: Input contains NaN, ... when doing fit_transform() in BERTopic

I want to build a BERTopic model with my own clustering algorithm (KMeans) and my own vectorizer (CountVectorizer), but I keep getting this warning and error when I call .fit_transform(data):
Warning:
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/vectorizers/_ctfidf.py:69: RuntimeWarning:
divide by zero encountered in divide
And then the error:
ValueError Traceback (most recent call last)
<ipython-input-104-1f024d22018f> in <module>
----> 1 topics, probs = bert_topic_model.fit_transform(final_df.body)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/_bertopic.py in fit_transform(self, documents, embeddings, y)
368 self._map_representative_docs(original_topics=True)
369 else:
--> 370 self._save_representative_docs(documents)
371
372 self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/_bertopic.py in _save_representative_docs(self, documents)
3000 bow = self.vectorizer_model.transform(selected_docs)
3001 ctfidf = self.ctfidf_model.transform(bow)
-> 3002 sim_matrix = cosine_similarity(ctfidf, self.c_tf_idf_[topic + self._outliers])
3003
3004 # Extract top 3 most representative documents
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/metrics/pairwise.py in cosine_similarity(X, Y, dense_output)
1178 # to avoid recursive import
1179
-> 1180 X, Y = check_pairwise_arrays(X, Y)
1181
1182 X_normalized = normalize(X, copy=True)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/metrics/pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype, accept_sparse, force_all_finite, copy)
144 estimator=estimator)
145 else:
--> 146 X = check_array(X, accept_sparse=accept_sparse, dtype=dtype,
147 copy=copy, force_all_finite=force_all_finite,
148 estimator=estimator)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
648 if sp.issparse(array):
649 _ensure_no_complex_data(array)
--> 650 array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
651 dtype=dtype, copy=copy,
652 force_all_finite=force_all_finite,
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse)
446 % spmatrix.format, stacklevel=2)
447 else:
--> 448 _assert_all_finite(spmatrix.data,
449 allow_nan=force_all_finite == 'allow-nan')
450
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
101 not allow_nan and not np.isfinite(X).all()):
102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103 raise ValueError(
104 msg_err.format
105 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
This is my full code:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from bertopic import BERTopic

features = final_df["body"]  # does not have NaN or Infinite values, I have checked 10 times
transformerVectoriser = CountVectorizer(analyzer='word', ngram_range=(1, 4), vocabulary=vocab_list)
# my vocab list does not have NaN or Infinite values, I have checked 10 times
cluster_model = KMeans(n_clusters=50, init='k-means++', max_iter=1500, random_state=None)
bert_topic_model = BERTopic(hdbscan_model=cluster_model,
                            vectorizer_model=transformerVectoriser,
                            verbose=True,
                            top_n_words=15)
# final_df.body does not have NaN or Infinite values, I have checked 10 times
topics, probs = bert_topic_model.fit_transform(final_df.body)  # ERROR
I really do not know what the problem is or what is going on.
All values in vocab_list are strings and all values in final_df.body are strings.
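One thing worth checking (a sketch, not a confirmed diagnosis): with a fixed vocabulary, any document that contains none of the vocabulary terms produces an all-zero row in the bag-of-words matrix, which is a typical way to hit the divide-by-zero in the c-TF-IDF step and then NaN/inf values downstream. Using the names from the question:

import numpy as np

# build the bag-of-words matrix with the same vectorizer and fixed vocabulary
bow = transformerVectoriser.fit_transform(final_df["body"])

# rows that sum to zero are documents containing no vocabulary terms at all
empty_rows = np.asarray(bow.sum(axis=1)).ravel() == 0
print(f"{empty_rows.sum()} of {bow.shape[0]} documents match no vocabulary terms")

# inspect (or drop) those documents before calling fit_transform again
print(final_df["body"][empty_rows].head())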

Dask IndexError: list index out of range

So I have a folder called "data", say, containing many CSV files:
import dask.dataframe as dd
df = dd.read_csv('data/*.csv')
df.head()
df.column_1.mean().compute()
The above lines of code work perfectly and the dask.compute method does its job. But when I add the "include_path_column=True" parameter to the dd.read_csv() function call, I get the following error:
IndexError: list index out of range
When I expand the error, I get:
IndexError Traceback (most recent call last)
<ipython-input-129-4be67235bebb> in <module>
----> 1 df['H_hp'].mean().compute()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
165 dask.base.compute
166 """
--> 167 (result,) = compute(self, traverse=False, **kwargs)
168 return result
169
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
444 )
445
--> 446 dsk = collections_to_dsk(collections, optimize_graph, **kwargs)
447 keys, postcomputes = [], []
448 for x in collections:
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in collections_to_dsk(collections, optimize_graph, **kwargs)
216 dsk, keys = _extract_graph_and_keys(val)
217 groups[opt] = (dsk, keys)
--> 218 _opt = opt(dsk, keys, **kwargs)
219 _opt_list.append(_opt)
220
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/dataframe/optimize.py in optimize(dsk, keys, **kwargs)
19 dsk = fuse_roots(dsk, keys=flat_keys)
20
---> 21 dsk = ensure_dict(dsk)
22
23 if isinstance(keys, list):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/utils.py in ensure_dict(d)
1030 dd_id = id(dd)
1031 if dd_id not in seen:
-> 1032 result.update(dd)
1033 seen.add(dd_id)
1034 return result
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/dataframe/io/csv.py in __getitem__(self, key)
80
81 if self.paths is not None:
---> 82 path_info = (self.colname, self.paths[i], self.paths)
83 else:
84 path_info = None
IndexError: list index out of range
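One possible workaround sketch (not from the original post) while include_path_column misbehaves: read the files individually and attach the path as an ordinary column before concatenating. The folder and column names follow the question; this is an assumption, not a confirmed fix.

import glob
import dask.dataframe as dd

# read each CSV separately and record its source path in a regular column
parts = []
for path in sorted(glob.glob("data/*.csv")):
    parts.append(dd.read_csv(path).assign(path=path))

df = dd.concat(parts)
print(df["column_1"].mean().compute())  # column_1 as in the question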

MemoryError when using pandas_profiling profile_report

I'm trying to profile an Excel file. It is a very small data set, only 30 columns and 535 rows, but when I run the profile_report function it stops at a different percentage each time, always with the same message:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-41-283dd2cb2000> in <module>
1 df=pd.read_excel(path_working+'Documents/Information/'+'sample.xlsx')
2 profile = df.profile_report(title='Sample Exploratory')
----> 3 profile.to_file(path_working+'sample.html')
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_file(self, output_file, silent)
276 create_html_assets(output_file)
277
--> 278 data = self.to_html()
279
280 if output_file.suffix != ".html":
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
384
385 """
--> 386 return self.html
387
388 def to_json(self) -> str:
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in html(self)
199 def html(self):
200 if self._html is None:
--> 201 self._html = self._render_html()
202 return self._html
203
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
306 from pandas_profiling.report.presentation.flavours import HTMLReport
307
--> 308 report = self.report
309
310 disable_progress_bar = not config["progress_bar"].get(bool)
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in report(self)
193 def report(self):
194 if self._report is None:
--> 195 self._report = get_report_structure(self.description_set)
196 return self._report
197
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
172 def description_set(self):
173 if self._description_set is None:
--> 174 self._description_set = describe_df(
175 self.title, self.df, self.summarizer, self.typeset, self._sample
176 )
~\anaconda3\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df, summarizer, typeset, sample)
72 total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
73 ) as pbar:
---> 74 series_description = get_series_descriptions(df, summarizer, typeset, pbar)
75
76 pbar.set_postfix_str("Get variable types")
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, summarizer, typeset, pbar)
97 # TODO: use `Pool` for Linux-based systems
98 with multiprocessing.pool.ThreadPool(pool_size) as executor:
---> 99 for i, (column, description) in enumerate(
100 executor.imap_unordered(multiprocess_1d, args)
101 ):
~\anaconda3\lib\multiprocessing\pool.py in next(self, timeout)
866 if success:
867 return value
--> 868 raise value
869
870 __next__ = next # XXX
~\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
76 """
77 column, series = args
---> 78 return column, describe_1d(series, summarizer, typeset)
79
80 pool_size = config["pool_size"].get(int)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series, summarizer, typeset)
50 vtype = typeset.detect_type(series)
51
---> 52 return summarizer.summarize(series, dtype=vtype)
53
54
~\anaconda3\lib\site-packages\pandas_profiling\model\summarizer.py in summarize(self, series, dtype)
54 """
55 summarizer_func = compose(self.summary_map.get(dtype, []))
---> 56 _, summary = summarizer_func(series, {"type": dtype})
57 return summary
58
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
17 def func(f, g):
18 def func2(*x):
---> 19 res = g(*x)
20 if type(res) == bool:
21 return f(*x)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in inner(series, summary)
70 if not summary["hashable"]:
71 return series, summary
---> 72 return fn(series, summary)
73
74 return inner
~\anaconda3\lib\site-packages\visions\utils\series_utils.py in inner(series, state, *args, **kwargs)
40 return False
41
---> 42 return fn(series, state, *args, **kwargs)
43
44 return inner
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in describe_numeric_1d(series, summary)
208
209 if chi_squared_threshold > 0.0:
--> 210 stats["chi_squared"] = chi_square(finite_values)
211
212 stats["range"] = stats["max"] - stats["min"]
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_helpers.py in chi_square(values, histogram)
352 def chi_square(values=None, histogram=None):
353 if histogram is None:
--> 354 histogram, _ = np.histogram(values, bins="auto")
355 return dict(chisquare(histogram)._asdict())
356
<__array_function__ internals> in histogram(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
790 a, weights = _ravel_and_check_weights(a, weights)
791
--> 792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
793
794 # Histogram is an integer or a float array depending on the weights.
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)
444
445 # bin edges must be computed
--> 446 bin_edges = np.linspace(
447 first_edge, last_edge, n_equal_bins + 1,
448 endpoint=True, dtype=bin_type)
<__array_function__ internals> in linspace(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
126
127 delta = stop - start
--> 128 y = _nx.arange(0, num, dtype=dt).reshape((-1,) + (1,) * ndim(delta))
129 # In-place multiplication y *= delta/div is faster, but prevents the multiplicant
130 # from overriding what class is produced, and thus prevents, e.g. use of Quantities,
MemoryError: Unable to allocate 1.75 EiB for an array with shape (251938683619878560,) and data type float64
I ran the same code in a different Python installation and it ran fine.
Thank you all in advance, and let me know if you need more information.
This is a bug in numpy.histogram (https://github.com/numpy/numpy/issues/10297), also reported on SO (Numpy histogram extremely slow on small data set).
The error is caused by the call to np.histogram(values, bins='auto'). When the input contains very large values, the "auto" method can try to generate an enormous number of bins that cannot fit in RAM.
As a workaround, you can remove the extreme values manually before generating the report, as sketched below.
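A minimal sketch of that workaround, assuming the offending values are extreme numeric outliers; the file path and the 1e12 cutoff are placeholders, not values from the question:

import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

df = pd.read_excel("sample.xlsx")  # placeholder path

# mask implausibly large magnitudes so np.histogram(..., bins="auto") is not
# asked to build an astronomical number of bins; the cutoff is arbitrary
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].mask(df[numeric_cols].abs() > 1e12)

profile = ProfileReport(df, title="Sample Exploratory")
profile.to_file("sample.html")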

Getting a TypeError: str argument expected

The code is for matching records using the recordlinkage toolkit. When I set up the comparison it gives me the error TypeError: str argument expected.
I don't understand what is wrong with the code:
import recordlinkage

compare = recordlinkage.Compare()
compare.exact('Sex Global', 'Sex')
compare.exact('Age', 'Age (Yrs) at presentation')
compare.string('ADD_LINE_1', 'Address',
               method='levenshtein',
               threshold=0.7)
compare.string('CITY', 'City',
               threshold=0.8)
compare.string('STATE_C', 'State',
               threshold=0.85)
features = compare.compute(combinations, phhs_abnormal, hhsc_copy)
Traceback:
TypeError Traceback (most recent call last)
<ipython-input-24-d84008a8ba19> in <module>
13 threshold=0.85)
14 features = compare.compute(combinations, phhs_abnormal,
---> 15 hhsc_copy)
~/.local/lib/python3.7/site-packages/recordlinkage/base.py in compute(self, pairs, x, x_link)
863
864 if self.n_jobs == 1:
--> 865 results = self._compute(pairs, x, x_link)
866 elif self.n_jobs > 1:
867 results = self._compute_parallel(
~/.local/lib/python3.7/site-packages/recordlinkage/base.py in _compute(self, pairs, x, x_link)
727 )
728
--> 729 result = feat._compute(data1, data2)
730 features.append((result, feat.label))
731
~/.local/lib/python3.7/site-packages/recordlinkage/base.py in _compute(self, left_on, right_on)
447 numpy.ndarray objects.
448 """
--> 449 result = self._compute_vectorized(*tuple(left_on + right_on))
450
451 return result
~/.local/lib/python3.7/site-packages/recordlinkage/compare.py in _compute_vectorized(self, s_left, s_right)
148 self.method))
149
--> 150 c = str_sim_alg(s_left, s_right)
151
152 if self.threshold is not None:
~/.local/lib/python3.7/site-packages/recordlinkage/algorithms/string.py in levenshtein_similarity(s1, s2)
62 raise err
63
---> 64 return conc.apply(levenshtein_apply)
65
66
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
4040 else:
4041 values = self.astype(object).values
-> 4042 mapped = lib.map_infer(values, f, convert=convert_dtype)
4043
4044 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
~/.local/lib/python3.7/site-packages/recordlinkage/algorithms/string.py in levenshtein_apply(x)
60 return np.nan
61 else:
---> 62 raise err
63
64 return conc.apply(levenshtein_apply)
~/.local/lib/python3.7/site-packages/recordlinkage/algorithms/string.py in levenshtein_apply(x)
54
55 try:
---> 56 return 1 - jellyfish.levenshtein_distance(x[0], x[1]) \
57 / np.max([len(x[0]), len(x[1])])
58 except Exception as err:
TypeError: str argument expected
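A sketch of one thing to check (an assumption, not a confirmed fix): jellyfish.levenshtein_distance expects two str arguments, so this error typically appears when a compared column contains NaN or non-string values. Casting the text columns to strings on both DataFrames before compute(), using the column names from the question (the left/right assignment here is a guess), would rule that out:

# make sure every compared column is a plain string (NaN becomes '')
left_cols = ['ADD_LINE_1', 'CITY', 'STATE_C']   # assumed to live in phhs_abnormal
right_cols = ['Address', 'City', 'State']       # assumed to live in hhsc_copy

for col in left_cols:
    phhs_abnormal[col] = phhs_abnormal[col].fillna('').astype(str)
for col in right_cols:
    hhsc_copy[col] = hhsc_copy[col].fillna('').astype(str)

features = compare.compute(combinations, phhs_abnormal, hhsc_copy)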

Possible bug with `xarray.Dataset.groupby()`?

I'm using Xarray version 0.8.0, Python 3.5.1, on Mac OS X El Capitan 10.11.6.
The following code works as expected.
import numpy
import xarray

id_data_array = xarray.DataArray([280, 306, 280], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 280
score (index) float64 0.8358 0.7536 0.9495
======
<xarray.Dataset>
Dimensions: (id: 2)
Coordinates:
* id (id) int64 280 306
Data variables:
score (id) int64 2 1
However, if I change just one little thing, to make the elements of id_data_array all distinct, then there is an error.
Code:
id_data_array = xarray.DataArray([280, 306, 120], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 120
score (index) float64 0.1353 0.0437 0.1687
======
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-92-cc412270ba2e> in <module>()
5 print(score_dataset)
6 print("======")
----> 7 print(score_dataset.groupby("id").count())
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/common.py in wrapped_func(self, dim, keep_attrs, **kwargs)
44 return self.reduce(func, dim, keep_attrs,
45 numeric_only=numeric_only, allow_lazy=True,
---> 46 **kwargs)
47 return wrapped_func
48
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
605 def reduce_dataset(ds):
606 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 607 return self.apply(reduce_dataset)
608
609 def assign(self, **kwargs):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in apply(self, func, **kwargs)
562 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
563 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 564 combined = self._concat(applied)
565 result = self._maybe_restore_empty_groups(combined)
566 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in _concat(self, applied)
570 concat_dim, positions = self._infer_concat_args(applied_example)
571
--> 572 combined = concat(applied, concat_dim)
573 reordered = _maybe_reorder(combined, concat_dim, positions)
574 return reordered
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
114 raise TypeError('can only concatenate xarray Dataset and DataArray '
115 'objects, got %s' % type(first_obj))
--> 116 return f(objs, dim, data_vars, coords, compat, positions)
117
118
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
276 if coord is not None:
277 # add concat dimension last to ensure that its in the final Dataset
--> 278 result[coord.name] = coord
279
280 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in __setitem__(self, key, value)
536 raise NotImplementedError('cannot yet use a dictionary as a key '
537 'to set Dataset values')
--> 538 self.update({key: value})
539
540 def __delitem__(self, key):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in update(self, other, inplace)
1434 dataset.
1435 """
-> 1436 variables, coord_names, dims = dataset_update_method(self, other)
1437
1438 return self._replace_vars_and_dims(variables, coord_names, dims,
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in dataset_update_method(dataset, other)
490 priority_arg = 1
491 indexes = dataset.indexes
--> 492 return merge_core(objs, priority_arg=priority_arg, indexes=indexes)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
371
372 coerced = coerce_pandas_values(objs)
--> 373 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
374 expanded = expand_variable_dicts(aligned)
375
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in deep_align(list_of_variable_maps, join, copy, indexes)
146 out.append(variables)
147
--> 148 aligned = partial_align(*targets, join=join, copy=copy, indexes=indexes)
149
150 for key, aligned_obj in zip(keys, aligned):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in partial_align(*objects, **kwargs)
109 valid_indexers = dict((k, v) for k, v in joined_indexes.items()
110 if k in obj.dims)
--> 111 result.append(obj.reindex(copy=copy, **valid_indexers))
112 return tuple(result)
113
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in reindex(self, indexers, method, tolerance, copy, **kw_indexers)
1216
1217 variables = alignment.reindex_variables(
-> 1218 self.variables, self.indexes, indexers, method, tolerance, copy=copy)
1219 return self._replace_vars_and_dims(variables)
1220
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in reindex_variables(variables, indexes, indexers, method, tolerance, copy)
218 target = utils.safe_cast_to_index(indexers[name])
219 indexer = index.get_indexer(target, method=method,
--> 220 **get_indexer_kwargs)
221
222 to_shape[name] = len(target)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
2080
2081 if not self.is_unique:
-> 2082 raise InvalidIndexError('Reindexing only valid with uniquely'
2083 ' valued Index objects')
2084
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
To me this seems buggy, because if this is the desired behaviour it would be very strange. Surely the case where all the elements of the DataArray we're grouping by are distinct should be supported?
Update
I've now uninstalled and reinstalled Xarray. The new Xarray is version 0.8.1, and it seems to work fine. So it may indeed be a bug in Xarray 0.8.0.
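For anyone checking their own environment, the installed version can be confirmed with the standard attribute (the upgrade command is just the usual pip invocation, nothing specific to this bug):

import xarray
print(xarray.__version__)  # should report 0.8.1 or later

# if it still reports 0.8.0, upgrade with:
#   pip install --upgrade xarray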
