ValueError: Input contains NaN, ... when doing fit_transform() in BERTopic - python

I want to build a BERTopic model with my own clustering algorithm (KMeans) and my own vectorizer (CountVectorizer), but I keep getting this warning and error when I call .fit_transform(data):
Warning:
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/vectorizers/_ctfidf.py:69: RuntimeWarning:
divide by zero encountered in divide
And then this error:
ValueError Traceback (most recent call last)
<ipython-input-104-1f024d22018f> in <module>
----> 1 topics, probs = bert_topic_model.fit_transform(final_df.body)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/_bertopic.py in fit_transform(self, documents, embeddings, y)
368 self._map_representative_docs(original_topics=True)
369 else:
--> 370 self._save_representative_docs(documents)
371
372 self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/_bertopic.py in _save_representative_docs(self, documents)
3000 bow = self.vectorizer_model.transform(selected_docs)
3001 ctfidf = self.ctfidf_model.transform(bow)
-> 3002 sim_matrix = cosine_similarity(ctfidf, self.c_tf_idf_[topic + self._outliers])
3003
3004 # Extract top 3 most representative documents
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/metrics/pairwise.py in cosine_similarity(X, Y, dense_output)
1178 # to avoid recursive import
1179
-> 1180 X, Y = check_pairwise_arrays(X, Y)
1181
1182 X_normalized = normalize(X, copy=True)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/metrics/pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype, accept_sparse, force_all_finite, copy)
144 estimator=estimator)
145 else:
--> 146 X = check_array(X, accept_sparse=accept_sparse, dtype=dtype,
147 copy=copy, force_all_finite=force_all_finite,
148 estimator=estimator)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
648 if sp.issparse(array):
649 _ensure_no_complex_data(array)
--> 650 array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
651 dtype=dtype, copy=copy,
652 force_all_finite=force_all_finite,
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse)
446 % spmatrix.format, stacklevel=2)
447 else:
--> 448 _assert_all_finite(spmatrix.data,
449 allow_nan=force_all_finite == 'allow-nan')
450
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
101 not allow_nan and not np.isfinite(X).all()):
102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103 raise ValueError(
104 msg_err.format
105 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
This is my full code:
features = final_df["body"] # does not have NaN or Infinite values, I have checked 10 times
transformerVectoriser = CountVectorizer(analyzer = 'word', ngram_range = (1, 4), vocabulary = vocab_list)
#my vocab list does not have NaN or Infinite values, I have checked 10 times
cluster_model = KMeans(n_clusters = 50, init='k-means++', max_iter = 1500, random_state=None)
bert_topic_model = BERTopic(hdbscan_model = cluster_model,
vectorizer_model = transformerVectoriser,
verbose = True,
top_n_words = 15)
#final_df.body does not have NaN or Infinite values, I have checked 10 times
topics, probs = bert_topic_model.fit_transform(final_df.body) #ERROR
I really do not know what the problem is or what is going on.
All values in vocab_list are strings, and all values in final_df.body are strings.
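One thing worth checking (a guess, not a confirmed diagnosis): when CountVectorizer is built with a fixed vocabulary, any document whose tokens never appear in vocab_list produces an all-zero bag-of-words row. The c-TF-IDF step can then divide by zero, which matches the RuntimeWarning above, and the resulting NaN values are what cosine_similarity later rejects. A minimal check, assuming transformerVectoriser and final_df are exactly as defined above:
import numpy as np
# Assumption: transformerVectoriser and final_df are the objects defined in the question
bow = transformerVectoriser.fit_transform(final_df.body)
# Documents that match nothing in vocab_list show up as all-zero rows
row_sums = np.asarray(bow.sum(axis=1)).ravel()
print("documents with no vocabulary hits:", int((row_sums == 0).sum()))
If that count is non-zero, widening vocab_list or dropping those documents before fitting would be the first thing to try.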

Related

Error during data scaling in pandas data frame

I have a dataset in CSV format. I am trying to scale my dataset, but I am getting an error. As I understand it, I need to convert from 3D to 2D, but I am not sure how to do that.
Example of my dataset:
63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3
57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2
53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1
57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0
56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0
56.0,1.0,3.0,130.0,256.0,1.0,2.0,142.0,1.0,0.6,2.0,1.0,6.0,2
44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0
52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0
57.0,1.0,3.0,150.0,168.0,0.0,0.0,174.0,0.0,1.6,1.0,0.0,3.0,0
48.0,1.0,2.0,110.0,229.0,0.0,0.0,168.0,0.0,1.0,3.0,0.0,7.0,1
54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0,0
My code:
import pandas as pd
from sklearn.preprocessing import StandardScaler
df = pd.read_csv('processed_cleveland_data.csv')
ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
Error:
ValueError
Traceback (most recent call last)
<ipython-input-5-6db223ceefcd> in <module>
4 df = pd.read_csv('processed_cleveland_data.csv')
5 ss = StandardScaler()
----> 6 df_scaled = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
~\Miniconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
697 if y is None:
698 # fit method of arity 1 (unsupervised transformation)
--> 699 return self.fit(X, **fit_params).transform(X)
700 else:
701 # fit method of arity 2 (supervised transformation)
~\Miniconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
728 # Reset internal state before fitting
729 self._reset()
--> 730 return self.partial_fit(X, y, sample_weight)
731
732 def partial_fit(self, X, y=None, sample_weight=None):
~\Miniconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
764 """
765 first_call = not hasattr(self, "n_samples_seen_")
--> 766 X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767 estimator=self, dtype=FLOAT_DTYPES,
768 force_all_finite='allow-nan', reset=first_call)
~\Miniconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419 out = X
420 elif isinstance(y, str) and y == 'no_validation':
--> 421 X = check_array(X, **check_params)
422 out = X
423 else:
~\Miniconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Miniconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
614 array = array.astype(dtype, casting="unsafe", copy=False)
615 else:
--> 616 array = np.asarray(array, order=order, dtype=dtype)
617 except ComplexWarning as complex_warning:
618 raise ValueError("Complex data not supported\n"
~\Miniconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
~\Miniconda3\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
1897
1898 def __array__(self, dtype=None) -> np.ndarray:
-> 1899 return np.asarray(self._values, dtype=dtype)
1900
1901 def __array_wrap__(
~\Miniconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: '?'
Use na_values to convert ? to missing values:
df = pd.read_csv('processed_cleveland_data.csv', na_values='?')
#if csv has no header
#df = pd.read_csv('processed_cleveland_data.csv', na_values='?', header=None)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
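Note that StandardScaler itself tolerates the NaNs this produces (the traceback shows it validates with force_all_finite='allow-nan'), but most estimators fitted afterwards will not. A small follow-up sketch, assuming the same df; SimpleImputer with the 'median' strategy is just one reasonable choice for filling the gaps before scaling:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')  # replace each NaN with its column's median
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
df_scaled = pd.DataFrame(ss.fit_transform(df_imputed), columns=df.columns)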

min_max_scaler.fit_transform: ValueError: setting an array element with a sequence

I'm using min_max_scaler.fit_transform() to rescale each column in the DataFrame df.
df[['A', 'B', 'C']] = min_max_scaler.fit_transform(df[['A', 'B', 'C']])
I got ValueError: setting an array element with a sequence. However, this error only occurs when I process one of my CSV files; all the others work fine. I don't know where I should start debugging. Can anyone suggest some directions to figure out the issue?
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
569 if y is None:
570 # fit method of arity 1 (unsupervised transformation)
--> 571 return self.fit(X, **fit_params).transform(X)
572 else:
573 # fit method of arity 2 (supervised transformation)
~/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_data.py in fit(self, X, y)
337 # Reset internal state before fitting
338 self._reset()
--> 339 return self.partial_fit(X, y)
340
341 def partial_fit(self, X, y=None):
~/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_data.py in partial_fit(self, X, y)
371 X = check_array(X,
372 estimator=self, dtype=FLOAT_DTYPES,
--> 373 force_all_finite="allow-nan")
374
375 data_min = np.nanmin(X, axis=0)
~/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
529 array = array.astype(dtype, casting="unsafe", copy=False)
530 else:
--> 531 array = np.asarray(array, order=order, dtype=dtype)
532 except ComplexWarning:
533 raise ValueError("Complex data not supported\n"
~/anaconda3/lib/python3.7/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: setting an array element with a sequence.
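This error usually means at least one cell in the selected columns holds a sequence (a list or array) rather than a single number, so NumPy cannot build a flat float matrix from the frame. A small diagnostic sketch, assuming the same df and the column names A, B, C from the snippet above, to locate the offending cells:
import numpy as np
# Which columns did pandas fail to parse as plain numbers?
print(df[['A', 'B', 'C']].dtypes)
# Locate cells that hold a list/array instead of a single value
is_sequence = df[['A', 'B', 'C']].applymap(lambda v: isinstance(v, (list, tuple, np.ndarray)))
print(df[is_sequence.any(axis=1)])
Whatever shows up there is what the problematic CSV contains that the others do not.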

Jupyter: Creating Decision Tree, TypeError: '<' not supported between instances of 'str' and 'float'

I'm creating a decision tree in a Jupyter notebook, and when I start building the tree with my features and target class, Jupyter gives me an error in the cell dt = c.fit(X_train, y_train).
This is the error:
TypeError
Traceback (most recent call last)
<ipython-input-10-0f9186b7935c> in <module>()
----> 1 dt = c.fit(X_train, y_train)
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
788 sample_weight=sample_weight,
789 check_input=check_input,
--> 790 X_idx_sorted=X_idx_sorted)
791 return self
792
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
138
139 if is_classification:
--> 140 check_classification_targets(y)
141 y = np.copy(y)
142
~\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
167 y : array-like
168 """
--> 169 y_type = type_of_target(y)
170 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
171 'multilabel-indicator', 'multilabel-sequences']:
~\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
286 return 'continuous' + suffix
287
--> 288 if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
289 return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
290 else:
~\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
221 ar = np.asanyarray(ar)
222 if axis is None:
--> 223 return _unique1d(ar, return_index, return_inverse, return_counts)
224 if not (-ar.ndim <= axis < ar.ndim):
225 raise ValueError('Invalid axis kwarg specified for unique')
~\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
281 aux = ar[perm]
282 else:
--> 283 ar.sort()
284 aux = ar
285 flag = np.concatenate(([True], aux[1:] != aux[:-1]))
TypeError: '<' not supported between instances of 'str' and 'float'
I'm confused because my dataset's features are clean (they are all int) and only the target class is categorical.
Can someone tell me what is happening and what to do so that dt = c.fit(X_train, y_train) will work? c = DecisionTreeClassifier(min_samples_split=100); c is a decision tree classifier.
This is my sample data set:
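A hedged guess at where to look: np.unique(y) only fails with this TypeError when y mixes strings with floats, and the usual hidden float among string class labels is NaN. A minimal check, assuming y_train is a pandas Series:
# How many cells of each Python type does the target really contain?
print(y_train.map(type).value_counts())
# NaN (a float) among string labels is the common culprit
print(y_train.isna().sum())
# If NaNs are the cause, drop them (or impute) before fitting
mask = y_train.notna()
dt = c.fit(X_train[mask], y_train[mask])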

Pandas and scikit-learn: ValueError: Input contains NaN, infinity or a value too large for dtype('float64')

I am trying to fit a K-nearest neighbors model to my data, but I am getting this error:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Here is the code for my K-nearest neighbors algorithm:
def knn_train_test_new(training_col, target_col, df):
    np.random.seed(1)
    df = df.loc[np.random.permutation(len(df))]
    # shuffled_index = np.random.permutation(df.index)
    # df = df.reindex(shuffled_index)
    train_df = df.iloc[0:150]  # training set has 75% of the data
    test_df = df.iloc[150:]    # test set has 25% of the data
    k = [5]
    rmse = {}
    for k_val in k:
        model = KNeighborsRegressor(n_neighbors = k_val)
        model.fit(train_df[training_col], train_df[target_col])
        predictions = model.predict(test_df[training_col])
        mse = mean_squared_error(test_df[target_col], predictions)
        rmse[k_val] = (mse ** 0.5)
    return rmse

two_features = ["width", "wheel-base"]
rmse_val = knn_train_test_new(two_features, 'price', numeric_cars)
And the top five rows of my dataframe:
numeric_cars.head()
I don't get this error when I use shuffled_index (the lines I have commented out) instead of np.random.permutation. I don't clearly understand the difference between the two.
The complete error track:
ValueError Traceback (most recent call last)
<ipython-input-13-03e0dd7acfd0> in <module>()
25
26 two_features = ["width", "wheel-base"]
---> 27 rmse_val = knn_train_test_new(two_features, 'price', numeric_cars)
28
29 #rmse_results = {}
<ipython-input-13-03e0dd7acfd0> in knn_train_test_new(training_col, target_col, df)
14
15 model = KNeighborsRegressor(n_neighbors = k_val)
---> 16 model.fit(train_df[training_col], train_df[target_col])
17
18 predictions = model.predict(test_df[training_col])
~\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in fit(self, X, y)
743 """
744 if not isinstance(X, (KDTree, BallTree)):
--> 745 X, y = check_X_y(X, y, "csr", multi_output=True)
746 self._y = y
747 return self._fit(X)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
571 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
572 ensure_2d, allow_nd, ensure_min_samples,
--> 573 ensure_min_features, warn_on_dtype, estimator)
574 if multi_output:
575 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
451 % (array.ndim, estimator_name))
452 if force_all_finite:
--> 453 _assert_all_finite(array)
454
455 shape_repr = _shape_repr(array.shape)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X)
42 and not np.isfinite(X).all()):
43 raise ValueError("Input contains NaN, infinity"
---> 44 " or a value too large for %r." % X.dtype)
45
46
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
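On the difference between the two shuffles mentioned above: np.random.permutation(len(df)) produces the positional integers 0..len(df)-1, and df.loc[...] looks those up as index labels. If the frame's index is not exactly 0..len(df)-1 (for example after rows were filtered out), older pandas versions silently fill the missing labels with NaN rows, which is exactly what KNeighborsRegressor.fit then rejects; newer versions raise a KeyError instead. The commented-out shuffled_index permutes the existing labels, so reindexing with it cannot introduce NaN. A hedged sketch of two safe alternatives, assuming numeric_cars is the frame used above:
import numpy as np
np.random.seed(1)
# Position-based shuffle: safe regardless of what the index labels are
df_shuffled = numeric_cars.iloc[np.random.permutation(len(numeric_cars))]
# Or let pandas do it: sample the whole frame without replacement
df_shuffled = numeric_cars.sample(frac=1, random_state=1)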

Possible bug with `xarray.Dataset.groupby()`?

I'm using Xarray version 0.8.0, Python 3.5.1, on Mac OS X El Capitan 10.11.6.
The following code works as expected.
id_data_array = xarray.DataArray([280, 306, 280], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 280
score (index) float64 0.8358 0.7536 0.9495
======
<xarray.Dataset>
Dimensions: (id: 2)
Coordinates:
* id (id) int64 280 306
Data variables:
score (id) int64 2 1
However, if I change just one little thing, to make the elements of id_data_array all distinct, then there is an error.
Code:
id_data_array = xarray.DataArray([280, 306, 120], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 120
score (index) float64 0.1353 0.0437 0.1687
======
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-92-cc412270ba2e> in <module>()
5 print(score_dataset)
6 print("======")
----> 7 print(score_dataset.groupby("id").count())
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/common.py in wrapped_func(self, dim, keep_attrs, **kwargs)
44 return self.reduce(func, dim, keep_attrs,
45 numeric_only=numeric_only, allow_lazy=True,
---> 46 **kwargs)
47 return wrapped_func
48
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
605 def reduce_dataset(ds):
606 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 607 return self.apply(reduce_dataset)
608
609 def assign(self, **kwargs):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in apply(self, func, **kwargs)
562 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
563 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 564 combined = self._concat(applied)
565 result = self._maybe_restore_empty_groups(combined)
566 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in _concat(self, applied)
570 concat_dim, positions = self._infer_concat_args(applied_example)
571
--> 572 combined = concat(applied, concat_dim)
573 reordered = _maybe_reorder(combined, concat_dim, positions)
574 return reordered
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
114 raise TypeError('can only concatenate xarray Dataset and DataArray '
115 'objects, got %s' % type(first_obj))
--> 116 return f(objs, dim, data_vars, coords, compat, positions)
117
118
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
276 if coord is not None:
277 # add concat dimension last to ensure that its in the final Dataset
--> 278 result[coord.name] = coord
279
280 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in __setitem__(self, key, value)
536 raise NotImplementedError('cannot yet use a dictionary as a key '
537 'to set Dataset values')
--> 538 self.update({key: value})
539
540 def __delitem__(self, key):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in update(self, other, inplace)
1434 dataset.
1435 """
-> 1436 variables, coord_names, dims = dataset_update_method(self, other)
1437
1438 return self._replace_vars_and_dims(variables, coord_names, dims,
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in dataset_update_method(dataset, other)
490 priority_arg = 1
491 indexes = dataset.indexes
--> 492 return merge_core(objs, priority_arg=priority_arg, indexes=indexes)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
371
372 coerced = coerce_pandas_values(objs)
--> 373 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
374 expanded = expand_variable_dicts(aligned)
375
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in deep_align(list_of_variable_maps, join, copy, indexes)
146 out.append(variables)
147
--> 148 aligned = partial_align(*targets, join=join, copy=copy, indexes=indexes)
149
150 for key, aligned_obj in zip(keys, aligned):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in partial_align(*objects, **kwargs)
109 valid_indexers = dict((k, v) for k, v in joined_indexes.items()
110 if k in obj.dims)
--> 111 result.append(obj.reindex(copy=copy, **valid_indexers))
112 return tuple(result)
113
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in reindex(self, indexers, method, tolerance, copy, **kw_indexers)
1216
1217 variables = alignment.reindex_variables(
-> 1218 self.variables, self.indexes, indexers, method, tolerance, copy=copy)
1219 return self._replace_vars_and_dims(variables)
1220
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in reindex_variables(variables, indexes, indexers, method, tolerance, copy)
218 target = utils.safe_cast_to_index(indexers[name])
219 indexer = index.get_indexer(target, method=method,
--> 220 **get_indexer_kwargs)
221
222 to_shape[name] = len(target)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
2080
2081 if not self.is_unique:
-> 2082 raise InvalidIndexError('Reindexing only valid with uniquely'
2083 ' valued Index objects')
2084
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
To me this seems buggy, because if this is the desired behaviour it would be very strange. Surely the case where all the elements of the DataArray we're grouping by are distinct should work too?
Update
I've now uninstalled and reinstalled Xarray. The new Xarray is version 0.8.1, and it seems to work fine. So it may indeed be a bug in Xarray 0.8.0.
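If upgrading were not an option, one possible workaround (a sketch, not an official xarray fix) is to route the aggregation through pandas, which groups on an all-distinct key without complaint:
# Same counting operation, done in pandas instead of xarray
counts = score_dataset.to_dataframe().groupby("id").count()
print(counts)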
