I'm trying to take an integer column and map discrete values to another column. Basically, if a credit tier is marked 1, 2, or 3, another column maps those to no credit state, thin file, or no hit; the null values are then filled with 'valid'. However, I keep getting this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-129-926e6625f2b6> in <module>
1 #train.dtypes
----> 2 df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6012 args=args,
6013 kwds=kwds)
-> 6014 return op.get_result()
6015
6016 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
<ipython-input-129-926e6625f2b6> in <lambda>(row)
1 #train.dtypes
----> 2 df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
<ipython-input-126-462888d46184> in discrete_credit(row, variable)
6
7 """
----> 8 score = row[variable].map({1:'no_credit_state', 2:'thin_file', 3:"no_hit"})
9 score = row[score].fillna('valid')
10 score = pd.Categorical(row[score], ['valid', 'no_credit_state','thin_file', 'no_hit'])
AttributeError: ("'numpy.int64' object has no attribute 'map'", 'occurred at index 0')
Here is a code example that is throwing the same error:
import pandas as pd
credit = {'credit_52278': [1, 2, 3, 500, 550, 600, 650, 700, 750, 800, 900]}
df = pd.DataFrame(credit)
def discrete_credit(row, variable):
"""
allows thin files, no hits and no credit scores to float which will then allow the rest of the credit score to be fit \
with a spline
"""
score = row[variable].map({1:'no_credit_state', 2:'thin_file', 3:"no_hit"})
score = row[score].fillna('valid')
score = pd.Categorical(row[score], ['valid', 'no_credit_state','thin_file', 'no_hit'])
return score
df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
map is a Series method, but you are trying to use it on a scalar value (here a numpy.int64): with apply(..., axis=1), row[variable] is a single element of the row, not a Series.
You could simply do something like:
df['discrete_52278'] = (
df['credit_52278']
.map({
1: 'no_credit_state',
2: 'thin_file',
3: 'no_hit'
})
.fillna('valid')
.astype('category')
)
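If the explicit category order from the original function matters (for sorting or plotting later), here is a minimal sketch of a variant using an ordered CategoricalDtype, assuming the categories from the question:
from pandas.api.types import CategoricalDtype

# Ordered dtype built from the categories in the question
tier_type = CategoricalDtype(
    categories=['valid', 'no_credit_state', 'thin_file', 'no_hit'],
    ordered=True
)
df['discrete_52278'] = (
    df['credit_52278']
    .map({1: 'no_credit_state', 2: 'thin_file', 3: 'no_hit'})
    .fillna('valid')
    .astype(tier_type)
)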
I have the dataset below.
I want to compute the mean of the 'horsepower' column after grouping by the 'cylinders' and 'model year' columns, using pandas. I am running the code in a Jupyter notebook.
Below is my code:
df = pd.read_csv('auto_mpg.csv')
df.groupby(['cylinders','model year']).agg({'horsepower':'mean'})
Basically, I first group by the 'cylinders' and 'model year' columns and then apply the aggregation to get the mean value.
I am getting the error below:
DataError Traceback (most recent call last)
<ipython-input-105-967f7e0151c3> in <module>
2 #Creating a DataFrame grouped on cylinders and model_year and finding mean, min and max of horsepower
3 df = pd.read_csv('auto_mpg.csv')
----> 4 df.groupby(['cylinders','model year']).agg({'horsepower':['mean']})
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
949 func = maybe_mangle_lambdas(func)
950
--> 951 result, how = self._aggregate(func, *args, **kwargs)
952 if how is None:
953 return result
~\anaconda3\lib\site-packages\pandas\core\base.py in _aggregate(self, arg, *args, **kwargs)
414
415 try:
--> 416 result = _agg(arg, _agg_1dim)
417 except SpecificationError:
418
~\anaconda3\lib\site-packages\pandas\core\base.py in _agg(arg, func)
381 result = {}
382 for fname, agg_how in arg.items():
--> 383 result[fname] = func(fname, agg_how)
384 return result
385
~\anaconda3\lib\site-packages\pandas\core\base.py in _agg_1dim(name, how, subset)
365 "nested dictionary is ambiguous in aggregation"
366 )
--> 367 return colg.aggregate(how)
368
369 def _agg_2dim(how):
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
244 # but not the class list / tuple itself.
245 func = maybe_mangle_lambdas(func)
--> 246 ret = self._aggregate_multiple_funcs(func)
247 if relabeling:
248 ret.columns = columns
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _aggregate_multiple_funcs(self, arg)
317 obj._reset_cache()
318 obj._selection = name
--> 319 results[base.OutputKey(label=name, position=idx)] = obj.aggregate(func)
320
321 if any(isinstance(x, DataFrame) for x in results.values()):
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
238
239 if isinstance(func, str):
--> 240 return getattr(self, func)(*args, **kwargs)
241
242 elif isinstance(func, abc.Iterable):
~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in mean(self, numeric_only)
1391 Name: B, dtype: float64
1392 """
-> 1393 return self._cython_agg_general(
1394 "mean",
1395 alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
1049
1050 if len(output) == 0:
-> 1051 raise DataError("No numeric types to aggregate")
1052
1053 return self._wrap_aggregated_output(output, index=self.grouper.result_index)
DataError: No numeric types to aggregate
Meanwhile, min and max aggregation on the 'horsepower' column works fine:
df = pd.read_csv('auto_mpg.csv')
df.groupby(['cylinders','model year']).agg({'horsepower':['min','max']})
I loaded the auto-mpg dataset from https://www.kaggle.com/uciml/autompg-dataset/version/3nd and managed to replicate the problem.
The root cause is that the horsepower column is loaded as type object, with missing values represented as question-mark strings ('?'), for example:
df[df.horsepower.str.contains(r"\?")]
Pandas doesn't know how to take the mean of question marks, so the solution is to cast the column to float:
import numpy as np

# Convert non-digit strings to NaN
df.loc[~df.horsepower.str.isdigit(), "horsepower"] = np.NaN
# Cast to float
df.horsepower = df.horsepower.astype("float")
# Aggregate
df.groupby(["cylinders", "model year"]).agg({"horsepower": "mean"})
Tested with pandas==1.1.5 and numpy==1.19.5.
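An equivalent alternative (a sketch): pd.to_numeric with errors='coerce' turns any non-numeric string, including '?', into NaN in a single step:
# Coerce '?' (and any other non-numeric string) to NaN, then aggregate
df["horsepower"] = pd.to_numeric(df["horsepower"], errors="coerce")
df.groupby(["cylinders", "model year"]).agg({"horsepower": "mean"})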
Check the data type first. The root-cause error is at the bottom of your post:
raise DataError("No numeric types to aggregate")
If the data type is right, put 'mean' into brackets:
agg({'horsepower': ['mean']})
Try this:
df = pd.read_csv('auto_mpg.csv')
df.groupby(['cylinders','model year']).mean()["horsepower"]
df.groupby(['cylinders','model year']).mean() gives you the mean of every column; selecting "horsepower" afterwards returns just that column from the grouped result.
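A slightly leaner variant (a sketch) selects the column before aggregating, so the mean is only computed for 'horsepower':
# Select the column first, then aggregate only that column
df.groupby(['cylinders', 'model year'])['horsepower'].mean()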
I'm playing around with Python Dask. I followed their DataFrame example Jupyter notebook but failed at the step that converts a Dask DataFrame to a pandas DataFrame by calling compute(). Would anyone please advise what I did wrong?
Code:
### Cell0
!pip install "dask[complete]"
!pip install pandas
### Cell1
import dask
import dask.dataframe as dd
df = dask.datasets.timeseries()
df
### Cell2
df2 = df[df.y > 0]
df3 = df2.groupby('name').x.std()
df3
### Cell3
computed_df = df3.compute()
type(computed_df)
The error is raised when executing computed_df = df3.compute() in Cell 3:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-6-6da1eef50c1d> in <module>
----> 1 computed_df = df3.compute()
2 type(computed_df)
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/base.py in compute(self, **kwargs)
283 dask.base.compute
284 """
--> 285 (result,) = compute(self, traverse=False, **kwargs)
286 return result
287
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/base.py in compute(*args, **kwargs)
559 )
560
--> 561 dsk = collections_to_dsk(collections, optimize_graph, **kwargs)
562 keys, postcomputes = [], []
563 for x in collections:
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/base.py in collections_to_dsk(collections, optimize_graph, optimizations, **kwargs)
335 for opt, val in groups.items():
336 dsk, keys = _extract_graph_and_keys(val)
--> 337 dsk = opt(dsk, keys, **kwargs)
338
339 for opt in optimizations:
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/dataframe/optimize.py in optimize(dsk, keys, **kwargs)
20 else:
21 # Perform Blockwise optimizations for HLG input
---> 22 dsk = optimize_dataframe_getitem(dsk, keys=keys)
23 dsk = optimize_blockwise(dsk, keys=keys)
24 dsk = fuse_roots(dsk, keys=keys)
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/dataframe/optimize.py in optimize_dataframe_getitem(dsk, keys)
103 # Project columns and update blocks
104 old = layers[k]
--> 105 new = old.project_columns(columns)[0]
106 if new.name != old.name:
107 columns = list(columns)
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/layers.py in project_columns(self, columns)
941 # Apply column projection in IO function
942 try:
--> 943 io_func = self.io_func.project_columns(list(columns))
944 except AttributeError:
945 io_func = self.io_func
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/dataframe/io/demo.py in project_columns(self, columns)
87 func = copy.deepcopy(self)
88 func.columns = columns
---> 89 func.dtypes = {c: self.dtypes[c] for c in columns}
90 return func
91
~/.pyenv/versions/3.9.0/lib/python3.9/site-packages/dask/dataframe/io/demo.py in <dictcomp>(.0)
87 func = copy.deepcopy(self)
88 func.columns = columns
---> 89 func.dtypes = {c: self.dtypes[c] for c in columns}
90 return func
91
KeyError: 'gt-d5f81fc97f91e68c389fc34631419acc'
Interesting, I can reproduce this bug with:
python=3.9.4
pandas=1.2.4
dask=2021.5.0
distributed=2021.5.0
Specifically, the error occurs in this step:
df2 = df[df.y > 0]
I raised an issue on GitHub, but in the meantime downgrading the dask version to 2021.4.1 resolves the problem (the computed result will show):
python=3.9.4
pandas=1.2.4
dask=2021.4.1
distributed=2021.4.1
(note Python here is 3.9, which seems to be the case in your environment)
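To confirm which versions are actually active in the notebook before and after downgrading, a quick sanity check:
# Print the versions of the packages involved in the bug report
import dask
import distributed
import pandas as pd

print(dask.__version__, distributed.__version__, pd.__version__)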
I am having issues with the loc API of xarray. I am trying to select data that satisfies a certain condition. Below is a reproducible example:
import xarray as xr
da = xr.tutorial.load_dataset('air_temperature').air
mask = 100*xr.ones_like(da[0])
da[0].loc[da[0]<mask]
but this gives the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-238-75b6b6f544d4> in <module>
1 da = xr.tutorial.load_dataset('air_temperature').air
2 mask = 100*xr.ones_like(da[0])
----> 3 da[0].loc[da[0]<mask]
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/dataarray.py in __getitem__(self, key)
194 labels = indexing.expanded_indexer(key, self.data_array.ndim)
195 key = dict(zip(self.data_array.dims, labels))
--> 196 return self.data_array.sel(**key)
197
198 def __setitem__(self, key, value) -> None:
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/dataarray.py in sel(self, indexers, method, tolerance, drop, **indexers_kwargs)
1045 method=method,
1046 tolerance=tolerance,
-> 1047 **indexers_kwargs
1048 )
1049 return self._from_temp_dataset(ds)
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/dataset.py in sel(self, indexers, method, tolerance, drop, **indexers_kwargs)
1998 indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "sel")
1999 pos_indexers, new_indexes = remap_label_indexers(
-> 2000 self, indexers=indexers, method=method, tolerance=tolerance
2001 )
2002 result = self.isel(indexers=pos_indexers, drop=drop)
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/coordinates.py in remap_label_indexers(obj, indexers, method, tolerance, **indexers_kwargs)
390
391 pos_indexers, new_indexes = indexing.remap_label_indexers(
--> 392 obj, v_indexers, method=method, tolerance=tolerance
393 )
394 # attach indexer's coordinate to pos_indexers
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/indexing.py in remap_label_indexers(data_obj, indexers, method, tolerance)
259 coords_dtype = data_obj.coords[dim].dtype
260 label = maybe_cast_to_coords_dtype(label, coords_dtype)
--> 261 idxr, new_idx = convert_label_indexer(index, label, dim, method, tolerance)
262 pos_indexers[dim] = idxr
263 if new_idx is not None:
~/miniconda3/envs/ensemble/lib/python3.7/site-packages/xarray/core/indexing.py in convert_label_indexer(index, label, index_name, method, tolerance)
191 indexer = get_indexer_nd(index, label, method, tolerance)
192 if np.any(indexer < 0):
--> 193 raise KeyError("not all values found in index %r" % index_name)
194 return indexer, new_index
195
KeyError: "not all values found in index 'lat'"
Since the mask is identical to the xarray.DataArray da in terms of dimensions and coordinates, I don't think this error makes sense... Am I missing something or is this possibly a bug?
Thank you in advance for your help.
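Not an explanation of the .loc behaviour, but if the goal is element-wise masking rather than label-based selection, a sketch using xarray's where method sidesteps the index lookup entirely (it keeps values where the condition holds and fills NaN elsewhere):
import xarray as xr

da = xr.tutorial.load_dataset('air_temperature').air
mask = 100 * xr.ones_like(da[0])
# where() masks element-wise instead of looking up labels in the index
masked = da[0].where(da[0] < mask)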
I'm trying to map a dictionary value to a dataset in a function. I keep getting the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-114-f1360d45f8fc> in <module>
----> 1 df['unit_value_factor_4'] = df.apply(map_value, axis=1)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6012 args=args,
6013 kwds=kwds)
-> 6014 return op.get_result()
6015
6016 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
<ipython-input-113-2ec7fc46c34e> in map_value(row)
2 def map_value(row):
3 if row['RATING_CLASS_CODE'] == 'G':
----> 4 val = row['unit_value_model'].map(g_cn_value)
5
6 elif row['RATING_CLASS_CODE'] == 'CN':
AttributeError: ("'float' object has no attribute 'map'", 'occurred at index 40')
Below is the function. For each row, it looks up the RATING_CLASS_CODE and then maps the row's unit_value_model through the corresponding dictionary.
def map_value(row):
if row['RATING_CLASS_CODE'] == 'G':
val = row['unit_value_model'].map(g_cn_value)
elif row['RATING_CLASS_CODE'] == 'CN':
val = row['unit_value_model'].map(g_cn_value)
elif row['RATING_CLASS_CODE'] == 'NE':
val = row['unit_value_model'].map(ne_gv_value)
elif row['RATING_CLASS_CODE'] == 'GV':
val = row['unit_value_model'].map(ne_gv_value)
elif row['RATING_CLASS_CODE'] == 'LA':
val = row['unit_value_model'].map(la_coll_value)
else:
val = None
print(val)
return val
df['unit_value_factor_4'] = df.apply(map_value, axis=1)
I think you need np.select with multiple conditions.
Look at this answer for an explicit example.
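For instance, a minimal sketch of that approach, assuming g_cn_value, ne_gv_value and la_coll_value are the lookup dictionaries from the question:
import numpy as np

# One boolean condition per rating-class group, checked in order
conditions = [
    df['RATING_CLASS_CODE'].isin(['G', 'CN']),
    df['RATING_CLASS_CODE'].isin(['NE', 'GV']),
    df['RATING_CLASS_CODE'].eq('LA'),
]
# The value to use where the matching condition is True
choices = [
    df['unit_value_model'].map(g_cn_value),
    df['unit_value_model'].map(ne_gv_value),
    df['unit_value_model'].map(la_coll_value),
]
# Rows matching no condition fall back to the default, mirroring the else branch
df['unit_value_factor_4'] = np.select(conditions, choices, default=None)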
I am trying to use plotnine to build graphs and I keep coming across the same KeyError problem when I want to plot just the x-axis. See the traceback error below.
A sample of my data is:
WORD TAG TOPIC Value
0 hey aa 1 234
1 working bb 1 123
2 lullaby cc 2 32
3 Doggy cc 2 63
4 document aa 3 84
A sample of my code:
from plotnine import *
import pandas as pd
inFile = 'infile.csv'
df = pd.read_csv(inFile, names = ['WORD', 'TAG','TOPIC','VALUE'], header=0,sep='\t')
df = df.sort_values('VALUE', ascending=False)
sortedDf = df[:5]
plot1 = ggplot(sortedDf) + aes(x='TOPIC') + geom_histogram(binwidth=3)
where the final goal is to plot the count of each topic in a histogram.
I am not sure what data is missing that raises the following KeyError. There should be no need for a weight, since I am only interested in plotting the count of that one variable, i.e. topic 1 = 2, topic 2 = 2, topic 3 = 1.
Does anyone have a link to more detailed documentation of plotnine, or any experience with the library, to help me understand what I am missing?
Traceback Error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-112-71707b4cf21a> in <module>()
1 plot2 = ggplot(sortedDf) + aes(x='TOPIC') + geom_histogram(binwidth=3)
----> 2 print plot2
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in __repr__(self)
82 Print/show the plot
83 """
---> 84 self.draw()
85 plt.show()
86 return '<ggplot: (%d)>' % self.__hash__()
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in draw(self)
139 # assign a default theme
140 self = deepcopy(self)
--> 141 self._build()
142
143 # If no theme we use the default
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in _build(self)
235
236 # Apply and map statistics
--> 237 layers.compute_statistic(layout)
238 layers.map_statistic(self)
239
/Users/anaconda/lib/python2.7/site-packages/plotnine/layer.pyc in compute_statistic(self, layout)
92 def compute_statistic(self, layout):
93 for l in self:
---> 94 l.compute_statistic(layout)
95
96 def map_statistic(self, plot):
/Users/anaconda/lib/python2.7/site-packages/plotnine/layer.pyc in compute_statistic(self, layout)
369 data = self.stat.use_defaults(data)
370 data = self.stat.setup_data(data)
--> 371 data = self.stat.compute_layer(data, params, layout)
372 self.data = data
373
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in compute_layer(cls, data, params, layout)
194 return cls.compute_panel(pdata, pscales, **params)
195
--> 196 return groupby_apply(data, 'PANEL', fn)
197
198 #classmethod
/Users/anaconda/lib/python2.7/site-packages/plotnine/utils.pyc in groupby_apply(df, cols, func, *args, **kwargs)
615 # do not mark d as a slice of df i.e no SettingWithCopyWarning
616 d.is_copy = None
--> 617 lst.append(func(d, *args, **kwargs))
618 return pd.concat(lst, axis=axis, ignore_index=True)
619
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in fn(pdata)
192 return pdata
193 pscales = layout.get_scales(pdata['PANEL'].iat[0])
--> 194 return cls.compute_panel(pdata, pscales, **params)
195
196 return groupby_apply(data, 'PANEL', fn)
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in compute_panel(cls, data, scales, **params)
221 for _, old in data.groupby('group'):
222 old.is_copy = None
--> 223 new = cls.compute_group(old, scales, **params)
224 unique = uniquecols(old)
225 missing = unique.columns.difference(new.columns)
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat_bin.pyc in compute_group(cls, data, scales, **params)
107 new_data = assign_bins(
108 data['x'], breaks, data.get('weight'),
--> 109 params['pad'], params['closed'])
110 return new_data
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/binning.pyc in assign_bins(x, breaks, weight, pad, closed)
163 df = pd.DataFrame({'bin_idx': bin_idx, 'weight': weight})
164 wftable = df.pivot_table(
--> 165 'weight', index=['bin_idx'], aggfunc=np.sum)['weight']
166
167 # Empty bins get no value in the computed frequency table.
/Users/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
601 result = self.index.get_value(self, key)
602
--> 603 if not is_scalar(result):
604 if is_list_like(result) and not isinstance(result, Series):
605
/Users/anaconda/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_value(self, series, key)
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3557)()
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3240)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4363)()
KeyError: 'weight'
Nesting aes inside ggplot, as is done in R, may solve your issue:
plot1 = ggplot(sortedDf, aes(x='TOPIC')) + geom_histogram(binwidth=3)
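Alternatively, since TOPIC is discrete, geom_bar (which counts occurrences per value by default) may express the intent more directly than a binned histogram; a sketch:
# geom_bar's default stat is 'count', so no binwidth is needed for discrete values
plot1 = ggplot(sortedDf, aes(x='TOPIC')) + geom_bar()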