Error while converting categorical feature to numerical feature in Pandas - python

My dataframe contains a categorical feature 'Street' which can take 1 of the 2 possible values 'Grvl' or 'Pave'. I am trying to convert this categorical feature in to numerical value before fitting ML algorithm. My code looks like this
dataset['Street']=dataset['Street'].map({'Grvl':0,'Pave':1}).astype(int)
I have filled missing values with the most occurring value in the dataframe
dataset['Street'].isnull().sum()
I am getting following error
ValueError Traceback (most recent call last)
<ipython-input-59-86f0b031335a> in <module>()
2 print dataset['Street'].isnull().sum()
3 #dataset['MSZoning'] = dataset['MSZoning'].map( {'A': 0, 'C': 1,'FV': 2,'I':3,'RH':4,'RL':5,'RP':6,'RM':7} ).astype(int)
----> 4 dataset['Street']=dataset['Street'].map({'Grvl':0,'Pave':1}).astype(int)
5 dataset['LotShape']=dataset['LotShape'].map({'Reg':0,'IR1':1,'IR2':2,'IR3':3}).astype(int)
6 dataset['LandContour']=dataset['LandContour'].map({'Lvl':0,'Bnk':1,'HLS':2,'Low':3}).astype(int)
C:\Users\JAYASHREE\Anaconda2\lib\site-packages\pandas\core\generic.pyc in astype(self, dtype, copy, raise_on_error, **kwargs)
2948
2949 mgr = self._data.astype(dtype=dtype, copy=copy,
-> 2950 raise_on_error=raise_on_error, **kwargs)
2951 return self._constructor(mgr).__finalize__(self)
2952
C:\Users\JAYASHREE\Anaconda2\lib\site-packages\pandas\core\internals.pyc in astype(self, dtype, **kwargs)
2936
2937 def astype(self, dtype, **kwargs):
-> 2938 return self.apply('astype', dtype=dtype, **kwargs)
2939
2940 def convert(self, **kwargs):
C:\Users\JAYASHREE\Anaconda2\lib\site-packages\pandas\core\internals.pyc in apply(self, f, axes, filter, do_integrity_check, consolidate, raw, **kwargs)
2888
2889 kwargs['mgr'] = self
-> 2890 applied = getattr(b, f)(**kwargs)
2891 result_blocks = _extend_blocks(applied, result_blocks)
2892
C:\Users\JAYASHREE\Anaconda2\lib\site-packages\pandas\core\internals.pyc in astype(self, dtype, copy, raise_on_error, values, **kwargs)
432 **kwargs):
433 return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
--> 434 values=values, **kwargs)
435
436 def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
C:\Users\JAYASHREE\Anaconda2\lib\site-packages\pandas\core\internals.pyc in _astype(self, dtype, copy, raise_on_error, values, klass, mgr, **kwargs)
475
476 # _astype_nansafe works fine with 1-d only
--> 477 values = com._astype_nansafe(values.ravel(), dtype, copy=True)
478 values = values.reshape(self.shape)
479
C:\Users\JAYASHREE\Anaconda2\lib\site-packages\pandas\core\common.pyc in _astype_nansafe(arr, dtype, copy)
1912
1913 if np.isnan(arr).any():
-> 1914 raise ValueError('Cannot convert NA to integer')
1915 elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
1916 # work around NumPy brokenness, #1987
ValueError: Cannot convert NA to integer

You have NaN values in your dataframe ! Since You cannot convert series from object to integer (using asType(int)) if there are missing values you should fill missing values before !
dataset['Street'].isnull().sum() dosn't fill missing values
you can do it using pandas.DataFrame.fillna or sklearn.preprocessing.Imputer

Related

ValueError: could not convert string to float: '571,2'

I wrote this code: df['Liquid Milk'] = df['Liquid Milk'].replace("", np.nan).astype('float64')
I got an error below, not sure where is an error, have tried many different way, but still same error. Any help, appreciated.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-52-607dcacd5a1a> in <module>
----> 1 m['Liquid Milk(Mil Litres)']=m['Liquid Milk(Mil Litres)'].replace("", np.nan).astype('float64')
2
3
4
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
5679 # else, only a single dtype is given
5680 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5681 **kwargs)
5682 return self._constructor(new_data).__finalize__(self)
5683
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float: '571,2'
A float number's integer and fraction part must be separated by a . not a ,. Replace all ,s with .s.
float("571.2")
would work
float("571,2")
would fail.

Lifetimes package: float() argument must be a string or a number, not 'Day'

Getting the following error while using the summary_data_from_transaction_data utility function included within the Lifestyles python package. Using pandas version 0.2 on Google Colab.
TypeError: float() argument must be a string or a number, not 'Day'
Any help will be much appreciated.
Code:
data_summary = summary_data_from_transaction_data(data_final, customer_id_col = "CustomerID", datetime_col = "InvoiceDate", monetary_value_col = "Sales", observation_period_end = "2011-12-09", freq = "D")
Stacktrace:
/usr/local/lib/python3.6/dist-packages/lifetimes/utils.py in summary_data_from_transaction_data(transactions, customer_id_col, datetime_col, monetary_value_col, datetime_format, observation_period_end, freq)
194 summary_columns.append('monetary_value')
195
--> 196 return customers[summary_columns].astype("float64")
197
198
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
5880 # else, only a single dtype is given
5881 new_data = self._data.astype(
-> 5882 dtype=dtype, copy=copy, errors=errors, **kwargs
5883 )
5884 return self._constructor(new_data).__finalize__(self)
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
579
580 def astype(self, dtype, **kwargs):
--> 581 return self.apply("astype", dtype=dtype, **kwargs)
582
583 def convert(self, **kwargs):
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
436 kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
437
--> 438 applied = getattr(b, f)(**kwargs)
439 result_blocks = _extend_blocks(applied, result_blocks)
440
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
557
558 def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
--> 559 return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs)
560
561 def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
641 # _astype_nansafe works fine with 1-d only
642 vals1d = values.ravel()
--> 643 values = astype_nansafe(vals1d, dtype, copy=True, **kwargs)
644
645 # TODO(extension)
/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
727 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
728 # Explicit copy, or required since NumPy can't view from / to object.
--> 729 return arr.astype(dtype, copy=True)
730
731 return arr.view(dtype)
TypeError: float() argument must be a string or a number, not 'Day'
Sample data in the data_final df and associated dtypes are as per the attachments.
sample data
dtypes
Thanks for any help.
Apologies folks - I was able to resolve my issue after updating the Lifetimes package to the latest 0.11.1 version in Colab!

The astype function is not working mysteriously

so Im trying to transforming this values in a float to be able to sum(). The problem is there is something weird that wont let me accomplish it
Data:
cw= pd.DataFrame({ "campaign": "151515151515" ,
"Media_Cost": "$ 14,52" })
cw.dtypes
Media_Cost object
My attempts,
I tried all lines of code bellow, one at the time, neither works mysteriously..
cw["Media_Cost"] = cw["Media_Cost"].str.replace('$','')
# Attempt 1
cw.Media_Cost = cw.Media_Cost.astype(float)
# Attempt 3
cw.Media_Cost = len(float(cw.Media_Cost))
# Attempt 4
cw.Media_Cost = cw.Media_Cost.apply(lambda x: float(cw.Media_Cost))
Error persist..
cw["Media_Cost"] = cw["Media_Cost"].str.replace('$','').str.replace(',', '.').astype(float)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-382-f5688d76abed> in <module>
1 # cw.Media_Cost = cw.Media_Cost.apply(lambda x: float(cw.Media_Cost))
----> 2 cw["Media_Cost"] = cw["Media_Cost"].str.replace('$','').str.replace(',', '.').astype(float)
3
4 # cw.Media_Cost = float(cw.Media_Cost)
5
~\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
~\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float: '1.443.48'
You can try:
cw = pd.DataFrame({"campaign": "151515151515", "Media_Cost": "$ 1,443.48" }, index=[0])
cw["Media_Cost"] = cw["Media_Cost"].str.replace('$','').str.replace(',', '').astype(float)
cw.dtypes
Result:
campaign object
Media_Cost float64
dtype: object

Why aren't my objects being converted to strings?

Starting with a python object, I'm getting an error when I try to convert the string to a float using astype(string).astype(float).
I've used regular expressions to remove the units and spaces and removed rows with NA.
df['Length'] = df['Length'].astype(str).astype(float)
ValueError Traceback (most recent call last)
<ipython-input-137-724df1c0091a> in <module>
1 df['Length'] = df['Length'].astype(str).astype(float)
2 #df['Length'].astype(str).astype(float)
3 #df['Width'].astype(str).astype(float)
/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
/anaconda3/lib/python3.7/site-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
/anaconda3/lib/python3.7/site-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
/anaconda3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
/anaconda3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
/anaconda3/lib/python3.7/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float:
As John pointed out the error is on converting string to float.
To visually check for empty strings use df['Length'] == ''.
To count the number of empty strings use: sum(df['Length'] == '')
To drop the rows with empty strings use: df = df[df['Length'] != '']. This will modify your whole data frame and not just df['Length'].
Hope that helps.

Error setting an array element with a sequence, when using Pandas, astype (float)

I have large dataset which i am reading from text file and I want to perform an operation on it. I use
T[fields[0:-1]]=T[fields[0:-1]].astype(float)
to be sure that all the values are float. I get the Error setting an array element with a sequence on one the columns. I changed T.replace('NaN', np.nan) the NaN to nan but still the same issue. I used
dtypeCount =[T.iloc[:,i].apply(type).value_counts() for i in range(T.shape[1])]
to determine the type of the data on that column and this is the results
Name: PD_PRESSURE, dtype: int64, <class 'NoneType'> 3676479
<class 'float'> 192217
Due to size of the dataset I can't figure out where this coming from. Any thought on how I can solve this or how how I can find what is causing this?
Thanks in advance.
Update: Full Error message
ValueError Traceback (most recent call last)
<ipython-input-33-fa0a78194654> in <module>()
162 if Aggregate_Flag==1:
163 # This line make sure that all the data are defined as float
--> 164 T[fields[0:-1]]=T[fields[0:-1]].astype(float)
165 # defining the function inside the loop is not the best practice. However, since the number of iterations
166 #( number of file are small), I put it insider the loop to improve the readibility of the code.
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
116 else:
117 kwargs[new_arg_name] = new_arg_value
--> 118 return func(*args, **kwargs)
119 return wrapper
120 return _deprecate_kwarg
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
4002 # else, only a single dtype is given
4003 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4004 **kwargs)
4005 return self._constructor(new_data).__finalize__(self)
4006
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
3460
3461 def astype(self, dtype, **kwargs):
-> 3462 return self.apply('astype', dtype=dtype, **kwargs)
3463
3464 def convert(self, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3327
3328 kwargs['mgr'] = self
-> 3329 applied = getattr(b, f)(**kwargs)
3330 result_blocks = _extend_blocks(applied, result_blocks)
3331
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
543 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 544 **kwargs)
545
546 def _astype(self, dtype, copy=False, errors='raise', values=None,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
623
624 # _astype_nansafe works fine with 1-d only
--> 625 values = astype_nansafe(values.ravel(), dtype, copy=True)
626 values = values.reshape(self.shape)
627
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
701
702 if copy:
--> 703 return arr.astype(dtype)
704 return arr.view(dtype)
705
ValueError: setting an array element with a sequence.
If I exclude column PD_PRESSURE, I don't receive any error.
I also tried T['PD_PRESSURE'].dtype(float) and I get the error but for other columns it works fine.
If I run T[fields[0:-1]]=T[fields[0:-1]] it works fine by itself, based on these I thought probably the error is coming from PD_Pressure column.
T.info() returns
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3868696 entries, 2000-01-01 to 2017-04-11
columns (total 6 columns):
A object
B object
C object
D object
PD_PRESSURE object
F object
dtypes: object(6)
memory usage: 1.0+ GB

Categories

Resources