I'm trying to profile an Excel file. It is a very small data set, only 30 columns and 535 rows, but when I run the profile_report function it stops at a different percentage each time, always with the same message:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-41-283dd2cb2000> in <module>
1 df=pd.read_excel(path_working+'Documents/Information/'+'sample.xlsx')
2 profile = df.profile_report(title='Sample Exploratory')
----> 3 profile.to_file(path_working+'sample.html')
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_file(self, output_file, silent)
276 create_html_assets(output_file)
277
--> 278 data = self.to_html()
279
280 if output_file.suffix != ".html":
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
384
385 """
--> 386 return self.html
387
388 def to_json(self) -> str:
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in html(self)
199 def html(self):
200 if self._html is None:
--> 201 self._html = self._render_html()
202 return self._html
203
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
306 from pandas_profiling.report.presentation.flavours import HTMLReport
307
--> 308 report = self.report
309
310 disable_progress_bar = not config["progress_bar"].get(bool)
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in report(self)
193 def report(self):
194 if self._report is None:
--> 195 self._report = get_report_structure(self.description_set)
196 return self._report
197
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
172 def description_set(self):
173 if self._description_set is None:
--> 174 self._description_set = describe_df(
175 self.title, self.df, self.summarizer, self.typeset, self._sample
176 )
~\anaconda3\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df, summarizer, typeset, sample)
72 total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
73 ) as pbar:
---> 74 series_description = get_series_descriptions(df, summarizer, typeset, pbar)
75
76 pbar.set_postfix_str("Get variable types")
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, summarizer, typeset, pbar)
97 # TODO: use `Pool` for Linux-based systems
98 with multiprocessing.pool.ThreadPool(pool_size) as executor:
---> 99 for i, (column, description) in enumerate(
100 executor.imap_unordered(multiprocess_1d, args)
101 ):
~\anaconda3\lib\multiprocessing\pool.py in next(self, timeout)
866 if success:
867 return value
--> 868 raise value
869
870 __next__ = next # XXX
~\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
76 """
77 column, series = args
---> 78 return column, describe_1d(series, summarizer, typeset)
79
80 pool_size = config["pool_size"].get(int)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series, summarizer, typeset)
50 vtype = typeset.detect_type(series)
51
---> 52 return summarizer.summarize(series, dtype=vtype)
53
54
~\anaconda3\lib\site-packages\pandas_profiling\model\summarizer.py in summarize(self, series, dtype)
54 """
55 summarizer_func = compose(self.summary_map.get(dtype, []))
---> 56 _, summary = summarizer_func(series, {"type": dtype})
57 return summary
58
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
17 def func(f, g):
18 def func2(*x):
---> 19 res = g(*x)
20 if type(res) == bool:
21 return f(*x)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in inner(series, summary)
70 if not summary["hashable"]:
71 return series, summary
---> 72 return fn(series, summary)
73
74 return inner
~\anaconda3\lib\site-packages\visions\utils\series_utils.py in inner(series, state, *args, **kwargs)
40 return False
41
---> 42 return fn(series, state, *args, **kwargs)
43
44 return inner
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in describe_numeric_1d(series, summary)
208
209 if chi_squared_threshold > 0.0:
--> 210 stats["chi_squared"] = chi_square(finite_values)
211
212 stats["range"] = stats["max"] - stats["min"]
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_helpers.py in chi_square(values, histogram)
352 def chi_square(values=None, histogram=None):
353 if histogram is None:
--> 354 histogram, _ = np.histogram(values, bins="auto")
355 return dict(chisquare(histogram)._asdict())
356
<__array_function__ internals> in histogram(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
790 a, weights = _ravel_and_check_weights(a, weights)
791
--> 792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
793
794 # Histogram is an integer or a float array depending on the weights.
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)
444
445 # bin edges must be computed
--> 446 bin_edges = np.linspace(
447 first_edge, last_edge, n_equal_bins + 1,
448 endpoint=True, dtype=bin_type)
<__array_function__ internals> in linspace(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
126
127 delta = stop - start
--> 128 y = _nx.arange(0, num, dtype=dt).reshape((-1,) + (1,) * ndim(delta))
129 # In-place multiplication y *= delta/div is faster, but prevents the multiplicant
130 # from overriding what class is produced, and thus prevents, e.g. use of Quantities,
MemoryError: Unable to allocate 1.75 EiB for an array with shape (251938683619878560,) and data type float64
I ran the same code in a different Python installation and it ran fine.
Thank you all in advance, and let me know if you need more information.
This is a bug in numpy.histogram (https://github.com/numpy/numpy/issues/10297), also reported on SO (Numpy histogram extremely slow on small data set).
The error is caused by the call to np.histogram(x, bins='auto'). When the input contains very large values, the "auto" method can try to generate an enormous number of bins that cannot fit in RAM.
As a workaround, you can remove the large values manually before generating the report.
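As a sketch of that workaround (the 1e12 cutoff and the file paths are placeholders, not values from the question):
import pandas as pd
import pandas_profiling  # registers the df.profile_report accessor

df = pd.read_excel("sample.xlsx")

# Keep only rows whose numeric values are below a plausibility cutoff;
# the 1e12 threshold is an assumption -- tune it to your data.
# fillna(0) keeps rows with missing numeric values.
numeric_cols = df.select_dtypes("number").columns
df_clean = df[(df[numeric_cols].abs().fillna(0) < 1e12).all(axis=1)]

profile = df_clean.profile_report(title="Sample Exploratory")
profile.to_file("sample.html")
Alternatively, the traceback shows the chi-square step only runs when chi_squared_threshold > 0, so disabling it in the report configuration should also avoid the histogram call; check your pandas-profiling version's documentation for the exact setting.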
I'm working on a Python project with pandas and I'm trying to apply a style to every Nth row. I've been able to select every Nth row using iloc, but I cannot get the style to work with a basic function. Here's my example in context:
import pandas as pd

data = [[1,2,3],[2,3,4],[3,4,5],[4,5,6]]
df = pd.DataFrame(data)
df
0 1 2
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
df.iloc[1::2, :]
0 1 2
1 2 3 4
3 4 5 6
At this point everything returns as normal, but when applying the function below, I receive a "Too many indexers" error which I can't seem to resolve:
def highlight_everyother(s):
    if s.iloc[1::2, :]:
        return ['background-color: yellow']*3

df.style.apply(highlight_everyother, axis=1)
ERROR:
IndexingError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _repr_html_(self)
180 Hooks into Jupyter notebook rich display system.
181 """
--> 182 return self.render()
183
184 #Appender(
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in render(self, **kwargs)
535 * table_attributes
536 """
--> 537 self._compute()
538 # TODO: namespace all the pandas keys
539 d = self._translate()
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _compute(self)
610 r = self
611 for func, args, kwargs in self._todo:
--> 612 r = func(self)(*args, **kwargs)
613 return r
614
~\Anaconda3\lib\site-packages\pandas\io\formats\style.py in _apply(self, func, axis, subset, **kwargs)
618 data = self.data.loc[subset]
619 if axis is not None:
--> 620 result = data.apply(func, axis=axis, result_type="expand", **kwargs)
621 result.columns = data.columns
622 else:
~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
6876 kwds=kwds,
6877 )
-> 6878 return op.get_result()
6879
6880 def applymap(self, func) -> "DataFrame":
~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
184 return self.apply_raw()
185
--> 186 return self.apply_standard()
187
188 def apply_empty_result(self):
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
311
312 # compute the result using the series generator
--> 313 results, res_index = self.apply_series_generator()
314
315 # wrap results
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
339 else:
340 for i, v in enumerate(series_gen):
--> 341 results[i] = self.f(v)
342 keys.append(v.name)
343
<ipython-input-49-a5b996f8d6c8> in highlight_everyother(s)
11
12 def highlight_everyother(s):
---> 13 if s.iloc[1::2, :]:
14 return ['background-color: yellow']*3
15
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
2065 def _getitem_tuple(self, tup: Tuple):
2066
-> 2067 self._has_valid_tuple(tup)
2068 try:
2069 return self._getitem_lowerdim(tup)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)
699 for i, k in enumerate(key):
700 if i >= self.ndim:
--> 701 raise IndexingError("Too many indexers")
702 try:
703 self._validate_key(k, i)
IndexingError: Too many indexers
Any help would be appreciated. Thank you.
I would apply on axis=0 in case df is not indexed by a RangeIndex:
def highlight_everyother(s):
    return ['background-color: yellow; color:blue' if x % 2 == 1 else ''
            for x in range(len(s))]

df.style.apply(highlight_everyother)
Output: a styled table with every other row highlighted in yellow with blue text.
You are passing one row at a time to highlight_everyother, so the two-dimensional indexer s.iloc[1::2, :] fails on a single row; that's why you were getting the error. The below should work:
def highlight_everyother(s):
    # s is a single row; s.name is its index label
    if s.name % 2 == 1:
        return ['background-color: yellow']*3
    else:
        return ['background-color: white']*3

df.style.apply(highlight_everyother, axis=1)
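If you need every Nth row rather than every other row, the same idea generalizes. A sketch (n=3 is just for illustration; it assumes a default RangeIndex so that s.name is the row's integer position):
def highlight_every_nth(s, n=2):
    # s is one row; on a RangeIndex, s.name is its integer position
    if s.name % n == n - 1:
        return ['background-color: yellow'] * len(s)
    return [''] * len(s)

# Styler.apply forwards extra keyword arguments to the function
df.style.apply(highlight_every_nth, axis=1, n=3)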
The code below is for matching records using the recordlinkage toolkit. When I compute the comparison, it raises TypeError: str argument expected.
I don't understand what is wrong with the code:
import recordlinkage

compare = recordlinkage.Compare()
compare.exact('Sex Global', 'Sex')
compare.exact('Age', 'Age (Yrs) at presentation')
compare.string('ADD_LINE_1', 'Address',
method='levenshtein',
threshold=0.7)
compare.string('CITY', 'City',
threshold=0.8)
compare.string('STATE_C', 'State',
threshold=0.85)
features = compare.compute(combinations, phhs_abnormal, hhsc_copy)
Traceback:
TypeError Traceback (most recent call last)
<ipython-input-24-d84008a8ba19> in <module>
13 threshold=0.85)
14 features = compare.compute(combinations, phhs_abnormal,
---> 15 hhsc_copy)
~/.local/lib/python3.7/site-packages/recordlinkage/base.py in compute(self, pairs, x, x_link)
863
864 if self.n_jobs == 1:
--> 865 results = self._compute(pairs, x, x_link)
866 elif self.n_jobs > 1:
867 results = self._compute_parallel(
~/.local/lib/python3.7/site-packages/recordlinkage/base.py in _compute(self, pairs, x, x_link)
727 )
728
--> 729 result = feat._compute(data1, data2)
730 features.append((result, feat.label))
731
~/.local/lib/python3.7/site-packages/recordlinkage/base.py in _compute(self, left_on, right_on)
447 numpy.ndarray objects.
448 """
--> 449 result = self._compute_vectorized(*tuple(left_on + right_on))
450
451 return result
~/.local/lib/python3.7/site-packages/recordlinkage/compare.py in _compute_vectorized(self, s_left, s_right)
148 self.method))
149
--> 150 c = str_sim_alg(s_left, s_right)
151
152 if self.threshold is not None:
~/.local/lib/python3.7/site-packages/recordlinkage/algorithms/string.py in levenshtein_similarity(s1, s2)
62 raise err
63
---> 64 return conc.apply(levenshtein_apply)
65
66
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
4040 else:
4041 values = self.astype(object).values
-> 4042 mapped = lib.map_infer(values, f, convert=convert_dtype)
4043
4044 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
~/.local/lib/python3.7/site-packages/recordlinkage/algorithms/string.py in levenshtein_apply(x)
60 return np.nan
61 else:
---> 62 raise err
63
64 return conc.apply(levenshtein_apply)
~/.local/lib/python3.7/site-packages/recordlinkage/algorithms/string.py in levenshtein_apply(x)
54
55 try:
---> 56 return 1 - jellyfish.levenshtein_distance(x[0], x[1]) \
57 / np.max([len(x[0]), len(x[1])])
58 except Exception as err:
TypeError: str argument expected
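The bottom of the traceback shows jellyfish.levenshtein_distance being handed a non-string value, which usually means the compared columns contain NaN or numeric entries. A possible cleanup, as a sketch (column names copied from the question; the cleanup itself is an assumption about your data):
# Cast the non-missing values of each compared column to str, leaving NaN
# in place so recordlinkage can still score missing pairs as non-matches.
for frame, cols in [(phhs_abnormal, ['ADD_LINE_1', 'CITY', 'STATE_C']),
                    (hhsc_copy, ['Address', 'City', 'State'])]:
    for col in cols:
        mask = frame[col].notna()
        frame.loc[mask, col] = frame.loc[mask, col].astype(str)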
SpecificationError Traceback (most recent call last)
<ipython-input-42-d850d85f8342> in <module>
----> 1 train_label=extract_feature(train,train_label)
<ipython-input-33-23ab8dbf7d96> in extract_feature(df, train)
1 def extract_feature(df,train):
----> 2 t=groupy_feature(df,'ship','x',['max','min','mean','std','median','std','skew','sum'])
3 train=pd.merge(train,t,on='ship',how='left')
4 t=groupy_feature(df,'ship','y',['max','min','mean','std','median','std','skew','sum'])
5 train=pd.merge(train,t,on='ship',how='left')
<ipython-input-32-63d47754fe81> in groupy_feature(df, key, target, aggs)
4 agg_dict[f'{target}_{agg}']=agg
5 print(agg_dict)
----> 6 t=df.groupby(key)[target].agg(agg_dict).reset_index()
7 return t
~\AppData\Roaming\Python\Python37\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, *args, **kwargs)
251 # but not the class list / tuple itself.
252 func = _maybe_mangle_lambdas(func)
--> 253 ret = self._aggregate_multiple_funcs(func)
254 if relabeling:
255 ret.columns = columns
~\AppData\Roaming\Python\Python37\site-packages\pandas\core\groupby\generic.py in _aggregate_multiple_funcs(self, arg)
292 # GH 15931
293 if isinstance(self._selected_obj, Series):
--> 294 raise SpecificationError("nested renamer is not supported")
295
296 columns = list(arg.keys())
SpecificationError: nested renamer is not supported
The term 'std' appears twice in:
t=groupy_feature(df,'ship','x',['max','min','mean','std','median','std','skew','sum'])
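Besides the duplicate, newer pandas rejects any dict passed to a single grouped column's agg, which is exactly what raises SpecificationError: nested renamer is not supported. A sketch of groupy_feature that deduplicates the list, aggregates without a dict, and renames afterwards:
def groupy_feature(df, key, target, aggs):
    # drop duplicates such as the second 'std', preserving order
    aggs = list(dict.fromkeys(aggs))
    t = df.groupby(key)[target].agg(aggs).reset_index()
    t.columns = [key] + [f'{target}_{agg}' for agg in aggs]
    return t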
I'm using Xarray version 0.8.0, Python 3.5.1, on Mac OS X El Capitan 10.11.6.
The following code works as expected.
import numpy
import xarray

id_data_array = xarray.DataArray([280, 306, 280], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 280
score (index) float64 0.8358 0.7536 0.9495
======
<xarray.Dataset>
Dimensions: (id: 2)
Coordinates:
* id (id) int64 280 306
Data variables:
score (id) int64 2 1
However, if I change just one little thing so that the elements of id_data_array are all distinct, there is an error.
Code:
id_data_array = xarray.DataArray([280, 306, 120], coords={"index": range(3)})
random = numpy.random.rand(3)
score_data_array = xarray.DataArray(random, coords={"index": range(3)})
score_dataset = xarray.Dataset({"id": id_data_array, "score": score_data_array})
print(score_dataset)
print("======")
print(score_dataset.groupby("id").count())
Output:
<xarray.Dataset>
Dimensions: (index: 3)
Coordinates:
* index (index) int64 0 1 2
Data variables:
id (index) int64 280 306 120
score (index) float64 0.1353 0.0437 0.1687
======
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-92-cc412270ba2e> in <module>()
5 print(score_dataset)
6 print("======")
----> 7 print(score_dataset.groupby("id").count())
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/common.py in wrapped_func(self, dim, keep_attrs, **kwargs)
44 return self.reduce(func, dim, keep_attrs,
45 numeric_only=numeric_only, allow_lazy=True,
---> 46 **kwargs)
47 return wrapped_func
48
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in reduce(self, func, dim, keep_attrs, **kwargs)
605 def reduce_dataset(ds):
606 return ds.reduce(func, dim, keep_attrs, **kwargs)
--> 607 return self.apply(reduce_dataset)
608
609 def assign(self, **kwargs):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in apply(self, func, **kwargs)
562 kwargs.pop('shortcut', None) # ignore shortcut if set (for now)
563 applied = (func(ds, **kwargs) for ds in self._iter_grouped())
--> 564 combined = self._concat(applied)
565 result = self._maybe_restore_empty_groups(combined)
566 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/groupby.py in _concat(self, applied)
570 concat_dim, positions = self._infer_concat_args(applied_example)
571
--> 572 combined = concat(applied, concat_dim)
573 reordered = _maybe_reorder(combined, concat_dim, positions)
574 return reordered
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in concat(objs, dim, data_vars, coords, compat, positions, indexers, mode, concat_over)
114 raise TypeError('can only concatenate xarray Dataset and DataArray '
115 'objects, got %s' % type(first_obj))
--> 116 return f(objs, dim, data_vars, coords, compat, positions)
117
118
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/combine.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions)
276 if coord is not None:
277 # add concat dimension last to ensure that its in the final Dataset
--> 278 result[coord.name] = coord
279
280 return result
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in __setitem__(self, key, value)
536 raise NotImplementedError('cannot yet use a dictionary as a key '
537 'to set Dataset values')
--> 538 self.update({key: value})
539
540 def __delitem__(self, key):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in update(self, other, inplace)
1434 dataset.
1435 """
-> 1436 variables, coord_names, dims = dataset_update_method(self, other)
1437
1438 return self._replace_vars_and_dims(variables, coord_names, dims,
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in dataset_update_method(dataset, other)
490 priority_arg = 1
491 indexes = dataset.indexes
--> 492 return merge_core(objs, priority_arg=priority_arg, indexes=indexes)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
371
372 coerced = coerce_pandas_values(objs)
--> 373 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
374 expanded = expand_variable_dicts(aligned)
375
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in deep_align(list_of_variable_maps, join, copy, indexes)
146 out.append(variables)
147
--> 148 aligned = partial_align(*targets, join=join, copy=copy, indexes=indexes)
149
150 for key, aligned_obj in zip(keys, aligned):
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in partial_align(*objects, **kwargs)
109 valid_indexers = dict((k, v) for k, v in joined_indexes.items()
110 if k in obj.dims)
--> 111 result.append(obj.reindex(copy=copy, **valid_indexers))
112 return tuple(result)
113
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/dataset.py in reindex(self, indexers, method, tolerance, copy, **kw_indexers)
1216
1217 variables = alignment.reindex_variables(
-> 1218 self.variables, self.indexes, indexers, method, tolerance, copy=copy)
1219 return self._replace_vars_and_dims(variables)
1220
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/xarray/core/alignment.py in reindex_variables(variables, indexes, indexers, method, tolerance, copy)
218 target = utils.safe_cast_to_index(indexers[name])
219 indexer = index.get_indexer(target, method=method,
--> 220 **get_indexer_kwargs)
221
222 to_shape[name] = len(target)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
2080
2081 if not self.is_unique:
-> 2082 raise InvalidIndexError('Reindexing only valid with uniquely'
2083 ' valued Index objects')
2084
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
To me this seems buggy, because if this were the desired behaviour it would be very strange. Surely groupby should also handle the case where all the elements of the DataArray we're grouping by are distinct?
Update
I've now uninstalled and reinstalled Xarray. The new Xarray is version 0.8.1, and it seems to work fine. So it may indeed be a bug in Xarray 0.8.0.
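If anyone is stuck on 0.8.0, a possible workaround is to round-trip through pandas, which handles this groupby fine; a sketch:
# Convert the Dataset to a pandas DataFrame and group there instead.
counts = score_dataset.to_dataframe().groupby("id").count()
print(counts)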