I'm playing around with Python Dask. I followed their DataFrame example Jupyter notebook, but failed at the step of converting a Dask DataFrame to a pandas DataFrame by calling the compute() function. Could anyone advise what I did wrong?
Code:
### Cell0
!pip install "dask[complete]"
!pip install pandas
### Cell1
import dask
import dask.dataframe as dd
df = dask.datasets.timeseries()
df
### Cell2
df2 = df[df.y > 0]
df3 = df2.groupby('name').x.std()
df3
### Cell3
computed_df = df3.compute()
type(computed_df)
Error raised when executing computed_df = df3.compute() in cell 3.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-6-6da1eef50c1d> in <module>
----> 1 computed_df = df3.compute()
2 type(computed_df)
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/base.py in compute(self, **kwargs)
283 dask.base.compute
284 """
--> 285 (result,) = compute(self, traverse=False, **kwargs)
286 return result
287
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/base.py in compute(*args, **kwargs)
559 )
560
--> 561 dsk = collections_to_dsk(collections, optimize_graph, **kwargs)
562 keys, postcomputes = [], []
563 for x in collections:
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/base.py in collections_to_dsk(collections, optimize_graph, optimizations, **kwargs)
335 for opt, val in groups.items():
336 dsk, keys = _extract_graph_and_keys(val)
--> 337 dsk = opt(dsk, keys, **kwargs)
338
339 for opt in optimizations:
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/dataframe/optimize.py in optimize(dsk, keys, **kwargs)
20 else:
21 # Perform Blockwise optimizations for HLG input
---> 22 dsk = optimize_dataframe_getitem(dsk, keys=keys)
23 dsk = optimize_blockwise(dsk, keys=keys)
24 dsk = fuse_roots(dsk, keys=keys)
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/dataframe/optimize.py in optimize_dataframe_getitem(dsk, keys)
103 # Project columns and update blocks
104 old = layers[k]
--> 105 new = old.project_columns(columns)[0]
106 if new.name != old.name:
107 columns = list(columns)
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/layers.py in project_columns(self, columns)
941 # Apply column projection in IO function
942 try:
--> 943 io_func = self.io_func.project_columns(list(columns))
944 except AttributeError:
945 io_func = self.io_func
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/dataframe/io/demo.py in project_columns(self, columns)
87 func = copy.deepcopy(self)
88 func.columns = columns
---> 89 func.dtypes = {c: self.dtypes[c] for c in columns}
90 return func
91
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/dataframe/io/demo.py in <dictcomp>(.0)
87 func = copy.deepcopy(self)
88 func.columns = columns
---> 89 func.dtypes = {c: self.dtypes[c] for c in columns}
90 return func
91
KeyError: 'gt-d5f81fc97f91e68c389fc34631419acc'
Interesting, I can reproduce this bug with:
python=3.9.4
pandas=1.2.4
dask=2021.5.0
distributed=2021.5.0
Specifically, the error occurs in this step:
df2 = df[df.y > 0]
I raised an issue on GitHub, but in the meantime downgrading the dask version to 2021.4.1 resolves the problem (the computed result will show):
python=3.9.4
pandas=1.2.4
dask=2021.4.1
distributed=2021.4.1
(note that Python here is 3.9, which seems to match your environment)
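For completeness, a minimal way to pin the working versions from a notebook (assuming pip; adjust accordingly if you use conda):
!pip install "dask[complete]==2021.4.1" "distributed==2021.4.1"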
Related
I'm trying to run a fast Fourier transform on a pandas dataframe that I have. I am using the Kepler exoplanet dataset, here, and a specific notebook for it, here. I recreate the code in cells 27-30 (note that the code in cell 29 is executed elsewhere, so both dataframes have the same shape as in the original notebook), which looks as follows:
import numpy as np
import scipy.fft

def spectrum_getter(X):
    # Compute the FFT of one row and return its magnitude spectrum
    Spectrum = scipy.fft.fft(X, n=X.size)
    return np.abs(Spectrum)

x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)
x_test_FT = x_test.apply(spectrum_getter, axis=1)
Both x_train_OS and x_test are pandas.core.frame.DataFrame. Running this produces:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [245], in <module>
----> 1 x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)
2 x_test_FT = x_test.apply(spectrum_getter, axis=1)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\frame.py:8827, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8816 from pandas.core.apply import frame_apply
8818 op = frame_apply(
8819 self,
8820 func=func,
(...)
8825 kwargs=kwargs,
8826 )
-> 8827 return op.apply().__finalize__(self, method="apply")
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
724 elif self.raw:
725 return self.apply_raw()
--> 727 return self.apply_standard()
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:851, in FrameApply.apply_standard(self)
850 def apply_standard(self):
--> 851 results, res_index = self.apply_series_generator()
853 # wrap results
854 return self.wrap_results(results, res_index)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:867, in FrameApply.apply_series_generator(self)
864 with option_context("mode.chained_assignment", None):
865 for i, v in enumerate(series_gen):
866 # ignore SettingWithCopy here in case the user mutates
--> 867 results[i] = self.f(v)
868 if isinstance(results[i], ABCSeries):
869 # If we have a view on v, we need to make a copy because
870 # series_generator will swap out the underlying data
871 results[i] = results[i].copy(deep=False)
Input In [244], in spectrum_getter(X)
3 def spectrum_getter(X):
----> 4 Spectrum = scipy.fft.fft(X, n=X.size)
5 return np.abs(Spectrum)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_backend.py:22, in _ScipyBackend.__ua_function__(method, args, kwargs)
20 if fn is None:
21 return NotImplemented
---> 22 return fn(*args, **kwargs)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_pocketfft\basic.py:17, in c2c(forward, x, n, axis, norm, overwrite_x, workers, plan)
14 if plan is not None:
15 raise NotImplementedError('Passing a precomputed plan is not yet '
16 'supported by scipy.fft functions')
---> 17 tmp = _asfarray(x)
18 overwrite_x = overwrite_x or _datacopied(tmp, x)
19 norm = _normalization(norm, forward)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_pocketfft\helper.py:97, in _asfarray(x)
95 dtype = x.dtype.newbyteorder('=')
96 # Always align input
---> 97 copy = not x.flags['ALIGNED']
98 return np.array(x, dtype=dtype, copy=copy)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\flags.py:98, in Flags.__getitem__(self, key)
96 def __getitem__(self, key):
97 if key not in self._keys:
---> 98 raise KeyError(key)
100 return getattr(self, key)
KeyError: 'ALIGNED'
I attempted to convert the dataframe to a numpy array, but ran into other issues. What am I doing wrong here?
I ran into the same error, so I converted my data to a DataFrame, which solved my problem.
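For reference, a minimal sketch of a related workaround (my assumption, not necessarily the exact fix above): hand scipy.fft a plain NumPy array instead of a pandas Series, since scipy reads x.flags['ALIGNED'] and pandas Flags does not define that key:
import numpy as np
import scipy.fft

def spectrum_getter(X):
    x = X.to_numpy()  # convert the pandas Series to a NumPy array first
    return np.abs(scipy.fft.fft(x, n=x.size))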
I want to use ggplot2 within Jupyter Notebook. However, when I try to make an R magic cell and introduce a variable, I get an error.
Here is the code (one paragraph indicates one cell):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import rpy2
%matplotlib inline
from rpy2.robjects import pandas2ri
pandas2ri.activate()
%load_ext rpy2.ipython
%%R
library(ggplot2)
data = pd.read_csv('train_titanic.csv')
%%R -i data -w 900 -h 480 -u px
With this last cell, I get the following error (including the traceback):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasdataframe(obj)
54 try:
---> 55 od[name] = conversion.py2rpy(values)
56 except Exception as e:
~/anaconda3/envs/catenv/lib/python3.7/functools.py in wrapper(*args, **kw)
839
--> 840 return dispatch(args[0].__class__)(*args, **kw)
841
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasseries(obj)
125 if type(x) is not homogeneous_type:
--> 126 raise ValueError('Series can only be of one type, or None.')
127 # TODO: Could this be merged with obj.type.name == 'O' case above ?
ValueError: Series can only be of one type, or None.
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_object(cls, obj)
367 try:
--> 368 mv = memoryview(obj)
369 res = cls.from_memoryview(mv)
TypeError: memoryview: a bytes-like object is required, not 'Series'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-14-75e210679e4a> in <module>
----> 1 get_ipython().run_cell_magic('R', '-i data -w 900 -h 480 -u px', '\n\n')
~/anaconda3/envs/catenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2360 with self.builtin_trap:
2361 args = (magic_arg_s, cell)
-> 2362 result = fn(*args, **kwargs)
2363 return result
2364
</home/morgan/anaconda3/envs/catenv/lib/python3.7/site-packages/decorator.py:decorator-gen-130> in R(self, line, cell, local_ns)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/ipython/rmagic.py in R(self, line, cell, local_ns)
721 raise NameError("name '%s' is not defined" % input)
722 with localconverter(converter) as cv:
--> 723 ro.r.assign(input, val)
724
725 tmpd = self.setup_graphics(args)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in __call__(self, *args, **kwargs)
190 kwargs[r_k] = v
191 return (super(SignatureTranslatedFunction, self)
--> 192 .__call__(*args, **kwargs))
193
194
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in __call__(self, *args, **kwargs)
111
112 def __call__(self, *args, **kwargs):
--> 113 new_args = [conversion.py2rpy(a) for a in args]
114 new_kwargs = {}
115 for k, v in kwargs.items():
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/functions.py in <listcomp>(.0)
111
112 def __call__(self, *args, **kwargs):
--> 113 new_args = [conversion.py2rpy(a) for a in args]
114 new_kwargs = {}
115 for k, v in kwargs.items():
~/anaconda3/envs/catenv/lib/python3.7/functools.py in wrapper(*args, **kw)
838 '1 positional argument')
839
--> 840 return dispatch(args[0].__class__)(*args, **kw)
841
842 funcname = getattr(func, '__name__', 'singledispatch function')
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py in py2rpy_pandasdataframe(obj)
59 'The error is: %s'
60 % (name, str(e)))
---> 61 od[name] = StrVector(values)
62
63 return DataFrame(od)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/robjects/vectors.py in __init__(self, obj)
382
383 def __init__(self, obj):
--> 384 super().__init__(obj)
385 self._add_rops()
386
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in __init__(self, obj)
286 super().__init__(obj)
287 elif isinstance(obj, collections.abc.Sized):
--> 288 super().__init__(type(self).from_object(obj).__sexp__)
289 else:
290 raise TypeError('The constructor must be called '
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_object(cls, obj)
370 except (TypeError, ValueError):
371 try:
--> 372 res = cls.from_iterable(obj)
373 except ValueError:
374 msg = ('The class methods from_memoryview() and '
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _(*args, **kwargs)
26 def _cdata_res_to_rinterface(function):
27 def _(*args, **kwargs):
---> 28 cdata = function(*args, **kwargs)
29 # TODO: test cdata is of the expected CType
30 return _cdata_to_rinterface(cdata)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in from_iterable(cls, iterable, populate_func)
317 if populate_func is None:
318 cls._populate_r_vector(iterable,
--> 319 r_vector)
320 else:
321 populate_func(iterable, r_vector)
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _populate_r_vector(cls, iterable, r_vector)
300 r_vector,
301 cls._R_SET_VECTOR_ELT,
--> 302 cls._CAST_IN)
303
304 #classmethod
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _populate_r_vector(iterable, r_vector, set_elt, cast_value)
237 def _populate_r_vector(iterable, r_vector, set_elt, cast_value):
238 for i, v in enumerate(iterable):
--> 239 set_elt(r_vector, i, cast_value(v))
240
241
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/sexp.py in _as_charsxp_cdata(x)
430 return x.__sexp__._cdata
431 else:
--> 432 return conversion._str_to_charsxp(x)
433
434
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _str_to_charsxp(val)
118 s = rlib.R_NaString
119 else:
--> 120 cchar = _str_to_cchar(val)
121 s = rlib.Rf_mkCharCE(cchar, _CE_UTF8)
122 return s
~/anaconda3/envs/catenv/lib/python3.7/site-packages/rpy2/rinterface_lib/conversion.py in _str_to_cchar(s, encoding)
97 def _str_to_cchar(s, encoding: str = 'utf-8'):
98 # TODO: use isStrinb and installTrChar
---> 99 b = s.encode(encoding)
100 return ffi.new('char[]', b)
101
AttributeError: 'float' object has no attribute 'encode'
So I find that it is not possible to even start an R magic cell when importing my pandas dataframe object. However, I have tried creating R vectors inside the cell, and I find I can plot these using ggplot2 with no issues.
I am using Python 3.7.6, rpy2 3.1.0, jupyter-notebook 6.0.3, and Ubuntu 18.04.2 LTS on Windows Subsystem for Linux.
The problem is most likely with one (or more) columns having more than one type - therefore it is impossible to transfer the data into an R vector (which can hold only one data type). The traceback may be overwhelming, but here is the relevant part:
ValueError: Series can only be of one type, or None.
Which column is it? That's difficult to say without looking at the dataset that you load, but my general approach is to check the types in the columns:
types = data.applymap(type).apply(set)
types[types.apply(len) > 1]
Anything returned by the snippet above would be a candidate culprit. There are many different ways of dealing with the problem, depending on the exact nature of the data. Workarounds that I frequently use include:
calling data = data.infer_objects() - helps if pandas did not catch up with a dtype change and still stores the data as (suboptimal) Python objects
filling NaN with an empty string or a string constant if you have missing values in a string column (e.g. str_columns = str_columns.fillna(''))
dates.apply(pd.to_datetime, axis=1) if you have datetime objects but the dtype is object
using df.applymap(lambda x: datetime.combine(x, datetime.min.time()) if not isinstance(x, datetime) else x) if you have a mixture of date and datetime objects
In some very rare cases pandas stores the data differently than rpy2 expects (following certain manipulations); then writing the dataframe to a CSV file and reading it back from disk helps - but this is likely not what you are facing here, as you start from a freshly read dataframe.
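To make the check concrete, here is a small self-contained sketch (the frame and the 'name' column are made up for illustration) showing the detection step and the fillna workaround:
import numpy as np
import pandas as pd

# 'name' mixes str and float, because NaN is a float
data = pd.DataFrame({'name': ['Alice', np.nan, 'Carol'], 'age': [34, 28, 41]})

types = data.applymap(type).apply(set)
print(types[types.apply(len) > 1])  # flags 'name' as holding two types

data['name'] = data['name'].fillna('')  # the column is now homogeneous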
I just noticed there might be an even simpler reason for the problem. For some reason, pandas2ri requires you to call pandas2ri.activate() after importing it. This solved the problem for me.
I'm having some problems applying a text-search algorithm with Dask's parallelized infrastructure.
I'm trying to find the best match for each of 40,000 strings in a Series object against a list of 4,000 strings.
I could have done it using pandas.apply, but it's too time-expensive, so I decided to try parallelization with map_partitions in Dask.
I'm using this text-search library with python-Levenshtein: https://marcobonzanini.com/2015/02/25/fuzzy-string-matching-in-python
As you can see, it works fine on this example from a pandas dataset:
process.extractOne(df['endereco2'][1],choices=choices,scorer=fuzz.token_set_ratio,
score_cutoff=60)
Output: ('R ALVARO DUARTE DE ALMEIDA PROFESSOR', 85)
but it's not working when using Dask:
from dask import dataframe as dd
sd = dd.from_pandas(r13_2["endereco2"],npartitions=3).map_partitions(lambda df : df.apply(process.extractOne,choices=choices,scorer=fuzz.token_set_ratio,score_cutoff=60)).compute(scheduler='processes')
Output:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-69-f39ab0d086b5> in <module>
1 from dask import dataframe as dd
----> 2 sd = dd.from_pandas(r13_2["endereco2"],npartitions=3).map_partitions(lambda df : df.apply(process.extractOne,choices=choices,scorer=fuzz.token_set_ratio,score_cutoff=60)).compute(scheduler='processes')
~\Anaconda3\envs\mono\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\Anaconda3\envs\mono\lib\site-packages\dask\base.py in compute(*args, **kwargs)
396 keys = [x.__dask_keys__() for x in collections]
397 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398 results = schedule(dsk, keys, **kwargs)
399 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
400
~\Anaconda3\envs\mono\lib\site-packages\dask\multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, **kwargs)
190 get_id=_process_get_id, dumps=dumps, loads=loads,
191 pack_exception=pack_exception,
--> 192 raise_exception=reraise, **kwargs)
193 finally:
194 if cleanup:
~\Anaconda3\envs\mono\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
460 _execute_task(task, data) # Re-execute locally
461 else:
--> 462 raise_exception(exc, tb)
463 res, worker_id = loads(res_info)
464 state['cache'][key] = res
~\Anaconda3\envs\mono\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
109 def reraise(exc, tb=None):
110 if exc.__traceback__ is not tb:
--> 111 raise exc.with_traceback(tb)
112 raise exc
113
~\Anaconda3\envs\mono\lib\site-packages\dask\local.py in execute_task()
228 try:
229 task, data = loads(task_info)
--> 230 result = _execute_task(task, data)
231 id = get_id()
232 result = dumps((result, id))
~\Anaconda3\envs\mono\lib\site-packages\dask\core.py in _execute_task()
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
~\Anaconda3\envs\mono\lib\site-packages\dask\optimization.py in __call__()
940 % (len(self.inkeys), len(args)))
941 return core.get(self.dsk, self.outkey,
--> 942 dict(zip(self.inkeys, args)))
943
944 def __reduce__(self):
~\Anaconda3\envs\mono\lib\site-packages\dask\core.py in get()
147 for key in toposort(dsk):
148 task = dsk[key]
--> 149 result = _execute_task(task, cache)
150 cache[key] = result
151 result = _execute_task(out, cache)
~\Anaconda3\envs\mono\lib\site-packages\dask\core.py in _execute_task()
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
~\Anaconda3\envs\mono\lib\site-packages\dask\compatibility.py in apply()
91 def apply(func, args, kwargs=None):
92 if kwargs:
---> 93 return func(*args, **kwargs)
94 else:
95 return func(*args)
~\Anaconda3\envs\mono\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3877 func = kwargs.pop('_func')
3878 meta = kwargs.pop('_meta')
-> 3879 df = func(*args, **kwargs)
3880 if is_dataframe_like(df) or is_series_like(df) or is_index_like(df):
3881 if not len(df):
<ipython-input-69-f39ab0d086b5> in <lambda>()
1 from dask import dataframe as dd
----> 2 sd = dd.from_pandas(r13_2["endereco2"],npartitions=3).map_partitions(lambda df : df.apply(process.extractOne,choices=choices,scorer=fuzz.token_set_ratio,score_cutoff=60)).compute(scheduler='processes')
~\Anaconda3\envs\mono\lib\site-packages\pandas\core\series.py in apply()
3589 else:
3590 values = self.astype(object).values
-> 3591 mapped = lib.map_infer(values, f, convert=convert_dtype)
3592
3593 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
~\Anaconda3\envs\mono\lib\site-packages\pandas\core\series.py in f()
3576 if kwds or args and not isinstance(func, np.ufunc):
3577 def f(x):
-> 3578 return func(x, *args, **kwds)
3579 else:
3580 f = func
~\Anaconda3\envs\mono\lib\site-packages\fuzzywuzzy\process.py in extractOne()
218 best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
219 try:
--> 220 return max(best_list, key=lambda i: i[1])
221 except ValueError:
222 return None
~\Anaconda3\envs\mono\lib\site-packages\fuzzywuzzy\process.py in extractWithoutOrder()
76
77 # Run the processor on the input query.
---> 78 processed_query = processor(query)
79
80 if len(processed_query) == 0:
~\Anaconda3\envs\mono\lib\site-packages\fuzzywuzzy\utils.py in full_process()
93 s = asciidammit(s)
94 # Keep only Letters and Numbers (see Unicode docs).
---> 95 string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
96 # Force into lowercase.
97 string_out = StringProcessor.to_lower_case(string_out)
~\Anaconda3\envs\mono\lib\site-packages\fuzzywuzzy\string_processing.py in replace_non_letters_non_numbers_with_whitespace()
24 numbers with a single white space.
25 """
---> 26 return cls.regex.sub(" ", a_string)
27
28 strip = staticmethod(string.strip)
TypeError: expected string or bytes-like object
What's happening?
Obs: I solved my problem using pool.apply from the multiprocessing lib, but I still want to know what happened with Dask.
Doing the MCVE I realized that it was a naive syntax problem: I can't use map_partitions on a Dask DataFrame without specifying the column I'm using, even if there is only one column. So I should have used sd[0].map_partitions instead of sd.map_partitions.
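For illustration, a sketch of the corrected call (assuming, as in my case, that sd comes back as a single-column Dask DataFrame whose column label is 0; r13_2, choices, fuzz and process are from the question):
from dask import dataframe as dd

sd = dd.from_pandas(r13_2["endereco2"], npartitions=3)
result = sd[0].map_partitions(  # select the column explicitly, even with only one column
    lambda s: s.apply(process.extractOne, choices=choices,
                      scorer=fuzz.token_set_ratio, score_cutoff=60)
).compute(scheduler='processes')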
I'm trying to take an integer column and map its discrete values to another column. Basically, if a credit tier is marked 1, 2, or 3, another column maps those to no credit state, thin file, or no hit; the remaining null values are then filled with 'valid'. I tried this, but I keep getting this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-129-926e6625f2b6> in <module>
1 #train.dtypes
----> 2 df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6012 args=args,
6013 kwds=kwds)
-> 6014 return op.get_result()
6015
6016 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
<ipython-input-129-926e6625f2b6> in <lambda>(row)
1 #train.dtypes
----> 2 df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
<ipython-input-126-462888d46184> in discrete_credit(row, variable)
6
7 """
----> 8 score = row[variable].map({1:'no_credit_state', 2:'thin_file', 3:"no_hit"})
9 score = row[score].fillna('valid')
10 score = pd.Categorical(row[score], ['valid', 'no_credit_state','thin_file', 'no_hit'])
AttributeError: ("'numpy.int64' object has no attribute 'map'", 'occurred at index 0')
Here is a code example that is throwing the same error:
import pandas as pd

credit = {'credit_52278': [1, 2, 3, 500, 550, 600, 650, 700, 750, 800, 900]}
df = pd.DataFrame(credit)

def discrete_credit(row, variable):
    """
    allows thin files, no hits and no credit scores to float, which will then
    allow the rest of the credit scores to be fit with a spline
    """
    score = row[variable].map({1: 'no_credit_state', 2: 'thin_file', 3: 'no_hit'})
    score = row[score].fillna('valid')
    score = pd.Categorical(row[score], ['valid', 'no_credit_state', 'thin_file', 'no_hit'])
    return score

df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis=1)
map is a Series method, but you are trying to use it on a scalar value: with apply(..., axis=1), row[variable] is a single cell (here a numpy.int64), not a Series.
You could simply do something like:
df['discrete_52278'] = (
df['credit_52278']
.map({
1: 'no_credit_state',
2: 'thin_file',
3: 'no_hit'
})
.fillna('valid')
.astype('category')
)
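If the explicit category order from your original pd.Categorical call matters, here is a variant using pd.CategoricalDtype (same mapping, just an ordered dtype):
cat_type = pd.CategoricalDtype(
    categories=['valid', 'no_credit_state', 'thin_file', 'no_hit'],
    ordered=True,
)
df['discrete_52278'] = (
    df['credit_52278']
    .map({1: 'no_credit_state', 2: 'thin_file', 3: 'no_hit'})
    .fillna('valid')
    .astype(cat_type)
)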
I am trying to use plotnine to build graphs and I keep coming across the same KeyError problem when I want to plot just the x-axis. See the traceback error below.
A sample of my data is:
   WORD      TAG  TOPIC  Value
0  hey       aa   1      234
1  working   bb   1      123
2  lullaby   cc   2       32
3  Doggy     cc   2       63
4  document  aa   3       84
A sample of my code:
from plotnine import *
import pandas as pd
inFile = 'infile.csv'
df = pd.read_csv(inFile, names=['WORD', 'TAG', 'TOPIC', 'VALUE'], header=0, sep='\t')
df = df.sort_values('VALUE', ascending=False)
sortedDf = df[:5]
plot1 = ggplot(sortedDf) + aes(x='TOPIC') + geom_histogram(binwidth=3)
where the final goal is to plot the count of each topic in a histogram.
I am not sure what data is missing that raises the following KeyError, as there is no need for a weight: I am only interested in plotting the count of that one particular variable, i.e. topic 1 = 2, topic 2 = 2, topic 3 = 1.
Does anyone have a link to more detailed documentation of plotnine, or any experience with the library, to help me understand in more detail what I am missing?
Traceback Error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-112-71707b4cf21a> in <module>()
1 plot2 = ggplot(sortedDf) + aes(x='TOPIC') + geom_histogram(binwidth=3)
----> 2 print plot2
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in __repr__(self)
82 Print/show the plot
83 """
---> 84 self.draw()
85 plt.show()
86 return '<ggplot: (%d)>' % self.__hash__()
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in draw(self)
139 # assign a default theme
140 self = deepcopy(self)
--> 141 self._build()
142
143 # If no theme we use the default
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in _build(self)
235
236 # Apply and map statistics
--> 237 layers.compute_statistic(layout)
238 layers.map_statistic(self)
239
/Users/anaconda/lib/python2.7/site-packages/plotnine/layer.pyc in compute_statistic(self, layout)
92 def compute_statistic(self, layout):
93 for l in self:
---> 94 l.compute_statistic(layout)
95
96 def map_statistic(self, plot):
/Users/anaconda/lib/python2.7/site-packages/plotnine/layer.pyc in compute_statistic(self, layout)
369 data = self.stat.use_defaults(data)
370 data = self.stat.setup_data(data)
--> 371 data = self.stat.compute_layer(data, params, layout)
372 self.data = data
373
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in compute_layer(cls, data, params, layout)
194 return cls.compute_panel(pdata, pscales, **params)
195
--> 196 return groupby_apply(data, 'PANEL', fn)
197
198 #classmethod
/Users/anaconda/lib/python2.7/site-packages/plotnine/utils.pyc in groupby_apply(df, cols, func, *args, **kwargs)
615 # do not mark d as a slice of df i.e no SettingWithCopyWarning
616 d.is_copy = None
--> 617 lst.append(func(d, *args, **kwargs))
618 return pd.concat(lst, axis=axis, ignore_index=True)
619
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in fn(pdata)
192 return pdata
193 pscales = layout.get_scales(pdata['PANEL'].iat[0])
--> 194 return cls.compute_panel(pdata, pscales, **params)
195
196 return groupby_apply(data, 'PANEL', fn)
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in compute_panel(cls, data, scales, **params)
221 for _, old in data.groupby('group'):
222 old.is_copy = None
--> 223 new = cls.compute_group(old, scales, **params)
224 unique = uniquecols(old)
225 missing = unique.columns.difference(new.columns)
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat_bin.pyc in compute_group(cls, data, scales, **params)
107 new_data = assign_bins(
108 data['x'], breaks, data.get('weight'),
--> 109 params['pad'], params['closed'])
110 return new_data
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/binning.pyc in assign_bins(x, breaks, weight, pad, closed)
163 df = pd.DataFrame({'bin_idx': bin_idx, 'weight': weight})
164 wftable = df.pivot_table(
--> 165 'weight', index=['bin_idx'], aggfunc=np.sum)['weight']
166
167 # Empty bins get no value in the computed frequency table.
/Users/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
601 result = self.index.get_value(self, key)
602
--> 603 if not is_scalar(result):
604 if is_list_like(result) and not isinstance(result, Series):
605
/Users/anaconda/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_value(self, series, key)
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3557)()
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3240)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4363)()
KeyError: 'weight'
Nesting aes in ggplot like it is done in R may solve your issue:
plot1 = ggplot(sortedDf, aes(x='TOPIC')) + geom_histogram(binwidth=3)
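As a side note (my suggestion, not part of the answer above): since TOPIC is discrete and you only want counts, geom_bar may be a better fit than a histogram, as it counts occurrences directly:
plot1 = ggplot(sortedDf, aes(x='TOPIC')) + geom_bar()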