MemoryError on database import in Jupyter Notebook - python

I'm trying to import the dataset and stack the monthly files one under the other into a single DataFrame:
import pathlib
import pandas as pd

months = {'jan': 1, 'fev': 2, 'mar': 3, 'abr': 4, 'mai': 5, 'jun': 6,
          'jul': 7, 'ago': 8, 'set': 9, 'out': 10, 'nov': 11, 'dez': 12}

base_path = pathlib.Path('dataset')
base_airbnb = pd.DataFrame()

for file in base_path.iterdir():
    month_name = file.name[:3]
    month = months[month_name]
    year = file.name[-8:]
    year = int(year.replace('.csv', ''))
    df = pd.read_csv(base_path / file.name)
    df['year'] = year
    df['month'] = month
    base_airbnb = base_airbnb.append(df)

display(base_airbnb)
When I run it, pandas prints this FutureWarning for every file:

FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
  sort=sort,

and then the cell dies with a MemoryError. Something goes wrong and I don't understand why; as far as I can tell I'm doing everything correctly in Jupyter Notebook. Full output:
C:\Users\CASA\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (87) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-7-e9e45fb3206d> in <module>
     15     df['year'] = year
     16     df['month'] = month
---> 17     base_airbnb = base_airbnb.append(df)
     18
     19 big_data=pd.concat(months, axis=0)

~\Anaconda3\lib\site-packages\pandas\core\frame.py in append(self, other, ignore_index, verify_integrity, sort)
   7121             ignore_index=ignore_index,
   7122             verify_integrity=verify_integrity,
-> 7123             sort=sort,
   7124         )
   7125

~\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    253         verify_integrity=verify_integrity,
    254         copy=copy,
--> 255         sort=sort,
    256     )
    257

~\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy, sort)
    333
    334             # consolidate
--> 335             obj._consolidate(inplace=True)
    336             ndims.add(obj.ndim)
    337

~\Anaconda3\lib\site-packages\pandas\core\generic.py in _consolidate(self, inplace)
   5268         inplace = validate_bool_kwarg(inplace, "inplace")
   5269         if inplace:
-> 5270             self._consolidate_inplace()
   5271         else:
   5272             f = lambda: self._data.consolidate()

~\Anaconda3\lib\site-packages\pandas\core\generic.py in _consolidate_inplace(self)
   5250             self._data = self._data.consolidate()
   5251
-> 5252         self._protect_consolidate(f)
   5253
   5254     def _consolidate(self, inplace=False):

~\Anaconda3\lib\site-packages\pandas\core\generic.py in _protect_consolidate(self, f)
   5239         """
   5240         blocks_before = len(self._data.blocks)
-> 5241         result = f()
   5242         if len(self._data.blocks) != blocks_before:
   5243             self._clear_item_cache()

~\Anaconda3\lib\site-packages\pandas\core\generic.py in f()
   5248
   5249         def f():
-> 5250             self._data = self._data.consolidate()
   5251
   5252         self._protect_consolidate(f)

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in consolidate(self)
    930         bm = self.__class__(self.blocks, self.axes)
    931         bm._is_consolidated = False
--> 932         bm._consolidate_inplace()
    933         return bm
    934

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in _consolidate_inplace(self)
    935     def _consolidate_inplace(self):
    936         if not self.is_consolidated():
--> 937             self.blocks = tuple(_consolidate(self.blocks))
    938             self._is_consolidated = True
    939             self._known_consolidated = True

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in _consolidate(blocks)
   1911     for (_can_consolidate, dtype), group_blocks in grouper:
   1912         merged_blocks = _merge_blocks(
-> 1913             list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate
   1914         )
   1915         new_blocks = _extend_blocks(merged_blocks, new_blocks)

~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _merge_blocks(blocks, dtype, _can_consolidate)
   3318     # combination of those slices is a slice, too.
   3319     new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
-> 3320     new_values = np.vstack([b.values for b in blocks])
   3321
   3322     argsort = np.argsort(new_mgr_locs)

~\Anaconda3\lib\site-packages\numpy\core\shape_base.py in vstack(tup)
    281     """
    282     _warn_for_nonsequence(tup)
--> 283     return _nx.concatenate([atleast_2d(_m) for _m in tup], 0)
    284
    285

MemoryError:
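Not part of the original post, but the loop itself is the usual suspect here: DataFrame.append copies the entire accumulated frame on every iteration, so memory use grows quadratically with the number of files. A memory-friendlier sketch of the same logic (assuming the dataset folder contains only the monthly CSVs) collects the frames in a list and concatenates once:

import pathlib
import pandas as pd

months = {'jan': 1, 'fev': 2, 'mar': 3, 'abr': 4, 'mai': 5, 'jun': 6,
          'jul': 7, 'ago': 8, 'set': 9, 'out': 10, 'nov': 11, 'dez': 12}
base_path = pathlib.Path('dataset')

frames = []
for file in base_path.iterdir():
    df = pd.read_csv(file)
    df['year'] = int(file.name[-8:].replace('.csv', ''))
    df['month'] = months[file.name[:3]]
    frames.append(df)  # appending to a list is cheap; no frame copying here

# a single concat at the end instead of one copy per iteration;
# sort=False also opts in to the future behavior and silences the warning
base_airbnb = pd.concat(frames, ignore_index=True, sort=False)

If the data still doesn't fit in memory even once, specifying dtype or usecols in read_csv (as the DtypeWarning hints) shrinks each frame further.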

Related

.describe() and .info() not working for me in Jupyter Notebook

I am trying to use the describe method to get summary statistics of my data, but I keep getting the error message below. Any way to sort this out? The .info() method gives me the same problem.
TypeError Traceback (most recent call last)
<ipython-input-28-614cd2726f37> in <module>
----> 1 players_final.describe()
~\anaconda3\lib\site-packages\pandas\core\generic.py in describe(self, percentiles, include, exclude)
10265 elif (include is None) and (exclude is None):
10266 # when some numerics are found, keep only numerics
> 10267 data = self.select_dtypes(include=[np.number])
10268 if len(data.columns) == 0:
10269 data = self
~\anaconda3\lib\site-packages\pandas\core\frame.py in select_dtypes(self, include, exclude)
3420 # the "union" of the logic of case 1 and case 2:
3421 # we get the included and excluded, and return their logical and
-> 3422 include_these = Series(not bool(include), index=self.columns)
3423 exclude_these = Series(not bool(exclude), index=self.columns)
3424
~\anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
309 data = data.copy()
310 else:
--> 311 data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
312
313 data = SingleBlockManager(data, index, fastpath=True)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
710 value = maybe_cast_to_datetime(value, dtype)
711
--> 712 subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)
713
714 else:
~\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1231 value = ensure_str(value)
1232
-> 1233 subarr = np.empty(length, dtype=dtype)
1234 subarr.fill(value)
1235
TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type
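A quick sanity check, not from the original post: this exact message is commonly reported when a newer numpy (1.20+) is installed alongside an older pandas. Whether that mismatch applies to this environment is an assumption, but printing the versions confirms it, and upgrading pandas usually clears the error:

import numpy as np
import pandas as pd

# numpy >= 1.20 next to an older pandas is a common cause of this
# TypeError (an assumption about this environment, not a certainty);
# if the pair looks mismatched, `pip install -U pandas` is the usual fix
print('numpy :', np.__version__)
print('pandas:', pd.__version__)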

loc API of xarray gives KeyError: "not all values found in index"

I am having issues with the loc API of xarray. I am trying to select data that satisfy a certain condition. Below should be a reproducible example:
import xarray as xr
da = xr.tutorial.load_dataset('air_temperature').air
mask = 100*xr.ones_like(da[0])
da[0].loc[da[0]<mask]
but this gives the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-238-75b6b6f544d4> in <module>
1 da = xr.tutorial.load_dataset('air_temperature').air
2 mask = 100*xr.ones_like(da[0])
----> 3 da[0].loc[da[0]<mask]
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/dataarray.py in __getitem__(self, key)
194 labels = indexing.expanded_indexer(key, self.data_array.ndim)
195 key = dict(zip(self.data_array.dims, labels))
--> 196 return self.data_array.sel(**key)
197
198 def __setitem__(self, key, value) -> None:
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/dataarray.py in sel(self, indexers, method, tolerance, drop, **indexers_kwargs)
1045 method=method,
1046 tolerance=tolerance,
-> 1047 **indexers_kwargs
1048 )
1049 return self._from_temp_dataset(ds)
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/dataset.py in sel(self, indexers, method, tolerance, drop, **indexers_kwargs)
1998 indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "sel")
1999 pos_indexers, new_indexes = remap_label_indexers(
-> 2000 self, indexers=indexers, method=method, tolerance=tolerance
2001 )
2002 result = self.isel(indexers=pos_indexers, drop=drop)
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/coordinates.py in remap_label_indexers(obj, indexers, method, tolerance, **indexers_kwargs)
390
391 pos_indexers, new_indexes = indexing.remap_label_indexers(
--> 392 obj, v_indexers, method=method, tolerance=tolerance
393 )
394 # attach indexer's coordinate to pos_indexers
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/indexing.py in remap_label_indexers(data_obj, indexers, method, tolerance)
259 coords_dtype = data_obj.coords[dim].dtype
260 label = maybe_cast_to_coords_dtype(label, coords_dtype)
--> 261 idxr, new_idx = convert_label_indexer(index, label, dim, method, tolerance)
262 pos_indexers[dim] = idxr
263 if new_idx is not None:
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/indexing.py in convert_label_indexer(index, label, index_name, method, tolerance)
191 indexer = get_indexer_nd(index, label, method, tolerance)
192 if np.any(indexer < 0):
--> 193 raise KeyError("not all values found in index %r" % index_name)
194 return indexer, new_index
195
KeyError: "not all values found in index 'lat'"
Since the mask is identical to the xarray.DataArray da in terms of dimensions and coordinates, I don't think this error makes sense... Am I missing something or is this possibly a bug?
Thank you in advance for your help.
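Worth noting, though it is not an answer from the thread: .loc performs label-based lookup, so the boolean DataArray is interpreted as a set of labels to find in the 'lat'/'lon' indexes rather than as a mask. The masking idiom in xarray is .where(); a minimal sketch on the same tutorial data:

import xarray as xr

da = xr.tutorial.load_dataset('air_temperature').air

# .where() applies the condition as a mask: the shape is kept and cells
# failing the condition become NaN, instead of being looked up as labels
below_100 = da[0].where(da[0] < 100)

# drop=True additionally trims coordinates where nothing matches
below_100_trimmed = da[0].where(da[0] < 100, drop=True)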

ValueError when using .diff() with dask dataframe

I have a large time series data set which I want to process with Dask. Apart from a few other columns, there is a column 'id' which identifies individuals, a column 'transc_date' which identifies the date, and a column 'transc_time' identifying the time when an individual made a transaction.
The data is sorted using:
df = df.map_partitions(lambda x: x.sort_values(['id', 'transc_date', 'transc_time'], ascending=[True, True, True]))
transc_time is of type int and transc_date is of type datetime64.
I want to create a new column which gives me for each individual the number of days since the last transaction. For this I created the following function:
def get_diff_since_last_trans(df, plot=True):
    df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
    diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
    if plot:
        sns.distplot(diffs.values, kde=False, rug=False)
    return diffs
When I try this function on a small subset of the data (200k rows) it works as intended, but when I use it on the full data set I get the ValueError below. I dropped all ids which have fewer than 10 occurrences first, and transc_date does not contain NaNs, only datetime64 entries. Any idea what's going wrong?
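For reference, here is a minimal pandas-only sketch (self-contained, with made-up data) of what the per-partition lambda is intended to compute:

import pandas as pd

pdf = pd.DataFrame({
    'id': [1, 1, 1, 2, 2],
    'transc_date': pd.to_datetime(['2017-01-01', '2017-01-04', '2017-01-09',
                                   '2017-02-01', '2017-02-03']),
})
pdf = pdf.sort_values(['id', 'transc_date'])

# days since each individual's previous transaction; NaT for the first row
pdf['diff_last'] = pdf.groupby('id')['transc_date'].diff()
print(pdf['diff_last'].dt.days)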
ValueError Traceback (most recent call last)
<ipython-input-12-551d7256f328> in <module>()
1 a = get_diff_first_last_trans(df, plot=False)
----> 2 b = get_diff_since_last_trans(df, plot=False)
3 plot_trans_diff(a,b)
<ipython-input-10-8f83d4571659> in get_diff_since_last_trans(df, plot)
12 def get_diff_since_last_trans(df, plot=True):
13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
---> 14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
16 sns.distplot(diffs.values, kde = False, rug = False)
~/venv/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
133 dask.base.compute
134 """
--> 135         (result,) = compute(self, traverse=False, **kwargs)
    136         return result
137
~/venv/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
331 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
332 else (None, a) for a in args]
--> 333 results = get(dsk, keys, **kwargs)
334 results_iter = iter(results)
335 return tuple(a if f is None else f(next(results_iter), *a)
~/venv/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
1997 secede()
1998 try:
-> 1999 results = self.gather(packed, asynchronous=asynchronous)
2000 finally:
2001 for f in futures.values():
~/venv/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1435 return self.sync(self._gather, futures, errors=errors,
1436 direct=direct, local_worker=local_worker,
-> 1437 asynchronous=asynchronous)
1438
1439 #gen.coroutine
~/venv/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
590 return future
591 else:
--> 592             return sync(self.loop, func, *args, **kwargs)
    593
    594     def __repr__(self):
~/venv/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(1000000)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None
~/venv/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/venv/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/venv/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/venv/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1313 six.reraise(type(exception),
1314 exception,
-> 1315 traceback)
1316 if errors == 'skip':
1317 bad_keys.add(key)
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692             raise value.with_traceback(tb)
    693         raise value
    694     finally:
~/venv/lib/python3.6/site-packages/dask/dataframe/rolling.py in overlap_chunk()
30 parts = [p for p in (prev_part, current_part, next_part) if p is not None]
31 combined = pd.concat(parts)
---> 32 out = func(combined, *args, **kwargs)
33 if prev_part is None:
34 before = None
<ipython-input-10-8f83d4571659> in <lambda>()
11
12 def get_diff_since_last_trans(df, plot=True):
---> 13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
~/venv/lib/python3.6/site-packages/pandas/core/groupby.py in wrapper()
737 *args, **kwargs)
738 except (AttributeError):
--> 739             raise ValueError
    740
    741     return wrapper
ValueError:

Pandas groupby and describe flags AttributeError

I have a bunch of data stored in vals. The indices are monotonic, but not continuous. I'm attempting to do some analysis on histograms of the data, so I've created the following structure:
hist = pd.DataFrame(vals)
hist['bins'] = pd.cut(vals, 100)
This is data taken from an experimental instrument and I know that some of the bins have only 1 or 2 counts in them, which I'm trying to remove. I've tried using groupby as follows and get the following error (Full traceback included at the end of the note):
hist.groupby('bins').describe()
AttributeError: 'Categorical' object has no attribute 'flags'
However, when I do the following, the error does not show up and I get the expected result:
In[]: hist.index = hist.bins
In[]: hist['bins'] = hist.index
In[]: desc = hist.groupby('bins').describe()
In[]: desc.index.names = ['bins', 'describe']
Out[]: **describe with MultiIndex for rows.**
If I don't include the second line hist['bins'] = hist.index, I still get an AttributeError: 'Categorical' object has no attribute 'flags' and to the best that I can tell, the traceback is identical.
Can someone explain what the flags are and why they only seem to work when I set the index to bins and then replace the bins by the version stored in the index?
My end goal is to remove the data for bins with counts <= 6. If someone has an easier workaround than the way I'm going after it, I'd also be grateful.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-f606a051f2e4> in <module>()
----> 1 hist.groupby('bins').describe()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\displayhook.pyc in __call__(self, result)
245 self.start_displayhook()
246 self.write_output_prompt()
--> 247 format_dict, md_dict = self.compute_format_data(result)
248 self.write_format_data(format_dict, md_dict)
249 self.update_user_ns(result)
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\displayhook.pyc in compute_format_data(self, result)
155
156 """
--> 157 return self.shell.display_formatter.format(result)
158
159 def write_format_data(self, format_dict, md_dict=None):
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\formatters.pyc in format(self, obj, include, exclude)
150 md = None
151 try:
--> 152 data = formatter(obj)
153 except:
154 # FIXME: log the exception
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\core\formatters.pyc in __call__(self, obj)
479 type_pprinters=self.type_printers,
480 deferred_pprinters=self.deferred_printers)
--> 481 printer.pretty(obj)
482 printer.flush()
483 return stream.getvalue()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\lib\pretty.pyc in pretty(self, obj)
360 if callable(meth):
361 return meth(obj, self, cycle)
--> 362 return _default_pprint(obj, self, cycle)
363 finally:
364 self.end_group()
C:\Users\balterma\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.4.1.1975.win-x86_64\lib\site-packages\IPython\lib\pretty.pyc in _default_pprint(obj, p, cycle)
480 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
481 # A user-provided repr.
--> 482 p.text(repr(obj))
483 return
484 p.begin_group(1, '<')
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __repr__(self)
62 Yields Bytestring in Py2, Unicode String in py3.
63 """
---> 64 return str(self)
65
66
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __str__(self)
42 if compat.PY3:
43 return self.__unicode__()
---> 44 return self.__bytes__()
45
46 def __bytes__(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\base.pyc in __bytes__(self)
54
55 encoding = get_option("display.encoding")
---> 56 return self.__unicode__().encode(encoding, 'replace')
57
58 def __repr__(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\frame.pyc in __unicode__(self)
507 width = None
508 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 509 line_width=width, show_dimensions=show_dimensions)
510
511 return buf.getvalue()
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\frame.pyc in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, line_width, max_rows, max_cols, show_dimensions)
1340 max_rows=max_rows,
1341 max_cols=max_cols,
-> 1342 show_dimensions=show_dimensions)
1343 formatter.to_string()
1344
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\format.pyc in __init__(self, frame, buf, columns, col_space, header, index, na_rep, formatters, justify, float_format, sparsify, index_names, line_width, max_rows, max_cols, show_dimensions, **kwds)
345 self.columns = frame.columns
346
--> 347 self._chk_truncate()
348
349 def _chk_truncate(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\format.pyc in _chk_truncate(self)
410 else:
411 row_num = max_rows_adj // 2
--> 412 frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :]))
413 self.tr_row_num = row_num
414
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _get_new_axes(self)
957 new_axes[i] = ax
958
--> 959 new_axes[self.axis] = self._get_concat_axis()
960 return new_axes
961
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _get_concat_axis(self)
1009
1010 if self.keys is None:
-> 1011 concat_axis = _concat_indexes(indexes)
1012 else:
1013 concat_axis = _make_concat_multiindex(indexes, self.keys,
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\tools\merge.pyc in _concat_indexes(indexes)
1027
1028 def _concat_indexes(indexes):
-> 1029 return indexes[0].append(indexes[1:])
1030
1031
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\index.pyc in append(self, other)
4603 arrays = []
4604 for i in range(self.nlevels):
-> 4605 label = self.get_level_values(i)
4606 appended = [o.get_level_values(i) for o in other]
4607 arrays.append(label.append(appended))
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\index.pyc in get_level_values(self, level)
4239 unique = self.levels[num] # .values
4240 labels = self.labels[num]
-> 4241 filled = com.take_1d(unique.values, labels, fill_value=unique._na_value)
4242 values = unique._simple_new(filled, self.names[num],
4243 freq=getattr(unique, 'freq', None),
C:\Users\balterma\AppData\Local\Enthought\Canopy\User\lib\site-packages\pandas\core\common.pyc in take_nd(arr, indexer, axis, out, fill_value, mask_info, allow_fill)
829 out_shape[axis] = len(indexer)
830 out_shape = tuple(out_shape)
--> 831 if arr.flags.f_contiguous and axis == arr.ndim - 1:
832 # minor tweak that can make an order-of-magnitude difference
833 # for dataframes initialized directly from 2-d ndarrays
AttributeError: 'Categorical' object has no attribute 'flags'
This looks to be a bug with Categorical data that will be corrected in version 0.17.0 (issue here).
In the meantime, you can cast the category to an object dtype; this is effectively what happened when you assigned the bins to the index and back:
hist['bins'] = hist['bins'].astype(str)
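As for the stated end goal of dropping bins with counts <= 6, a sketch (synthetic stand-in data, and assuming bins has been cast to str as above):

import numpy as np
import pandas as pd

vals = np.random.randn(1000)                  # synthetic stand-in data
hist = pd.DataFrame({'vals': vals})
hist['bins'] = pd.cut(vals, 100).astype(str)  # object dtype, as above

# keep only the rows whose bin has more than 6 observations
filtered = hist.groupby('bins').filter(lambda g: len(g) > 6)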

Pandas Join results in keyerror on index column

The following join results in a KeyError on the index column:
# Import libraries
import pandas as pd
import numpy as np
# Open and load all files indexed by 'ISI_LOC'
df_all = pd.read_csv('AUTHORS.csv', index_col='ISI_LOC', dtype={'ISI_LOC':str, 'POSITION':int}, engine='c', low_memory=False)
df_addresses = pd.read_csv('ADDRESSES.csv', index_col='ISI_LOC', dtype={'ISI_LOC': str, 'POSITION':int, 'Seg1':str }, engine='c', low_memory=False)
# There are more, but for the sake of brevity...
An inspection of the dataframes shows the string index as expected.
# Goal: df_all.join([df_addresses, df_catagories, df_keywordsplus, df_articles])
df_all.join(df_addresses, on='ISI_LOC')
This results in:
KeyError Traceback (most recent call last)
<ipython-input-17-35d37498b69e> in <module>()
1 # df_all.join([df_addresses, df_catagories, df_keywordsplus, df_articles])
----> 2 df_all.join(df_addresses, on='ISI_LOC')
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\frame.py in join(self, other, on, how, lsuffix, rsuffix, sort)
3865 # For SparseDataFrame's benefit
3866 return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
-> 3867 rsuffix=rsuffix, sort=sort)
3868
3869 def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\frame.py in _join_compat(self, other, on, how, lsuffix, rsuffix, sort)
3879 return merge(self, other, left_on=on, how=how,
3880 left_index=on is None, right_index=True,
-> 3881 suffixes=(lsuffix, rsuffix), sort=sort)
3882 else:
3883 if on is not None:
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\tools\merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy)
36 right_on=right_on, left_index=left_index,
37 right_index=right_index, sort=sort, suffixes=suffixes,
---> 38 copy=copy)
39 return op.get_result()
40 if __debug__:
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\tools\merge.py in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy)
182 (self.left_join_keys,
183 self.right_join_keys,
--> 184 self.join_names) = self._get_merge_keys()
185
186 def get_result(self):
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\tools\merge.py in _get_merge_keys(self)
359 join_names.append(None)
360 else:
--> 361 left_keys.append(left[k].values)
362 join_names.append(k)
363 if isinstance(self.right.index, MultiIndex):
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1778 return self._getitem_multilevel(key)
1779 else:
-> 1780 return self._getitem_column(key)
1781
1782 def _getitem_column(self, key):
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1785 # get column
1786 if self.columns.is_unique:
-> 1787 return self._get_item_cache(key)
1788
1789 # duplicate columns & possible reduce dimensionaility
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1066 res = cache.get(item)
1067 if res is None:
-> 1068 values = self._data.get(item)
1069 res = self._box_item_values(item, values)
1070 cache[item] = res
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
2847
2848 if not isnull(item):
-> 2849 loc = self.items.get_loc(item)
2850 else:
2851 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Users\430010958\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\index.py in get_loc(self, key)
1400 loc : int if unique index, possibly slice or mask if not
1401 """
-> 1402 return self._engine.get_loc(_values_from_object(key))
1403
1404 def get_value(self, series, key):
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3807)()
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3687)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12310)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12261)()
KeyError: 'ISI_LOC'
Yes, I can use other methods. In fact, I have it working in a rather fugly way using this syntax, so I know that the data is formatted correctly:
df_catagories = pd.concat([df_catagories, df_keywordsplus], keys='ISI_LOC')
Which works, but not in the way I'd like it to. What am I missing on the join statement? I have played with 'how=' and other parameters without success.
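One likely explanation (a guess, since the CSVs aren't available to verify): on='ISI_LOC' tells join to look for a *column* named ISI_LOC in df_all, but index_col already consumed ISI_LOC as the index, hence the KeyError. Joining index-to-index, which is join's default when `on` is omitted, would look like this sketch:

import pandas as pd

df_all = pd.read_csv('AUTHORS.csv', index_col='ISI_LOC',
                     dtype={'ISI_LOC': str, 'POSITION': int})
df_addresses = pd.read_csv('ADDRESSES.csv', index_col='ISI_LOC',
                           dtype={'ISI_LOC': str, 'POSITION': int, 'Seg1': str})

# with `on=` omitted, join aligns the two indexes directly; the suffixes
# disambiguate columns present in both frames, such as POSITION
joined = df_all.join(df_addresses, how='left',
                     lsuffix='_auth', rsuffix='_addr')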
