Convert comma separator objects to numeric in Pandas

Convert comma separator objects to numeric in Pandas - python

I have a table with columns of data type object and int.
One of them is a dollar amount with a dollar sign ($) and a comma as the thousands separator. I would like to use describe() to summarise the dataframe, so I tried to read the file, strip the $ sign, and then convert the object column to integer:
df= pd.read_excel(r'C:\Users\xxxx\df.xlsx','my_df' ,engine="openpyxl", thousands=',')
df['my_col'] = df['my_col'].replace({'\$':''}, regex = True)
df['my_col'].astype(str).astype(int)
df.describe(datetime_is_numeric=True)
but it raised an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-133-2011d1ad889e> in <module>
4
5 df['my_col'] = df['my_col'].replace({'\$':''}, regex = True)
----> 6 df['my_col'].astype(str).astype(int)
7 df.describe(datetime_is_numeric=True)
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors)
5535 else:
5536 # else, only a single dtype is given
-> 5537 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
5538 return self._constructor(new_data).__finalize__(self, method="astype")
5539
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
565 self, dtype, copy: bool = False, errors: str = "raise"
566 ) -> "BlockManager":
--> 567 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
568
569 def convert(
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\internals\managers.py in apply(self, f, align_keys, **kwargs)
394 applied = b.apply(f, **kwargs)
395 else:
--> 396 applied = getattr(b, f)(**kwargs)
397 result_blocks = _extend_blocks(applied, result_blocks)
398
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
588 vals1d = values.ravel()
589 try:
--> 590 values = astype_nansafe(vals1d, dtype, copy=True)
591 except (ValueError, TypeError):
592 # e.g. astype_nansafe can fail on object-dtype of strings
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
964 # work around NumPy brokenness, #1987
965 if np.issubdtype(dtype.type, np.integer):
--> 966 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
967
968 # if we have a datetime/timedelta array of objects
pandas\_libs\lib.pyx in pandas._libs.lib.astype_intsafe()
ValueError: invalid literal for int() with base 10: '500.00'
If I change df['my_col'].astype(str).astype(int) to df['my_col'].astype(str).astype(float), it raises a different error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-134-65da7cbc042f> in <module>
4
5 df['my_col'] = df['my_col'].replace({'\$':''}, regex = True)
----> 6 df['my_col'].astype(str).astype(int)
7 df.describe(datetime_is_numeric=True)
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors)
5535 else:
5536 # else, only a single dtype is given
-> 5537 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
5538 return self._constructor(new_data).__finalize__(self, method="astype")
5539
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
565 self, dtype, copy: bool = False, errors: str = "raise"
566 ) -> "BlockManager":
--> 567 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
568
569 def convert(
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\internals\managers.py in apply(self, f, align_keys, **kwargs)
394 applied = b.apply(f, **kwargs)
395 else:
--> 396 applied = getattr(b, f)(**kwargs)
397 result_blocks = _extend_blocks(applied, result_blocks)
398
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
588 vals1d = values.ravel()
589 try:
--> 590 values = astype_nansafe(vals1d, dtype, copy=True)
591 except (ValueError, TypeError):
592 # e.g. astype_nansafe can fail on object-dtype of strings
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
987 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
988 # Explicit copy, or required since NumPy can't view from / to object.
--> 989 return arr.astype(dtype, copy=True)
990
991 return arr.view(dtype)
ValueError: could not convert string to float: '5,000.00'

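Change the replace call by adding one more pattern, so that the comma thousands separator is removed along with the dollar sign. Afterwards, convert with astype(float) rather than astype(int), because values such as '500.00' cannot be parsed as integers, and assign the result back to the column: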
df['my_col'] = df['my_col'].replace({'\$':'',',':''}, regex = True)

Related

could not convert string to float - object type

I'm working with a dataframe in Python using Pandas and Jupyter Notebook, and my dataframe has Longitude and Latitude columns with values like '-23,4588'. Somehow, every time I try to convert them to float, I get an error saying 'could not convert string to float'.
I tried to change the comma, tried to change the .csv column type to float, but nothing works.
A part of my code:
ValueError Traceback (most recent call last)
C:\TEMP/ipykernel_12640/4061618161.py in <module>
----> 1 newocorr_sjc['Latitude'] = newocorr_sjc['Latitude'].astype(float)
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors)
5875 else:
5876 # else, only a single dtype is given
-> 5877 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
5878 return self._constructor(new_data).__finalize__(self, method="astype")
5879
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
629 self, dtype, copy: bool = False, errors: str = "raise"
630 ) -> "BlockManager":
--> 631 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
632
633 def convert(
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
425 applied = b.apply(f, **kwargs)
426 else:
--> 427 applied = getattr(b, f)(**kwargs)
428 except (TypeError, NotImplementedError):
429 if not ignore_failures:
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
671 vals1d = values.ravel()
672 try:
--> 673 values = astype_nansafe(vals1d, dtype, copy=True)
674 except (ValueError, TypeError):
675 # e.g. astype_nansafe can fail on object-dtype of strings
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
1095 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
1096 # Explicit copy, or required since NumPy can't view from / to object.
-> 1097 return arr.astype(dtype, copy=True)
1098
1099 return arr.view(dtype)
ValueError: could not convert string to float: '-23,5327'

Maybe you should use decimal=',' as an argument of pd.read_csv:
df = pd.read_csv('data.csv', sep=';', decimal=',')
>>> df.select_dtypes(float)
17 22 23
0 17.5 -23.5327 -46.8182
1 56.3 -23.4315 -47.1269

ValueError: could not convert string to float: '571,2'

I wrote this code: df['Liquid Milk'] = df['Liquid Milk'].replace("", np.nan).astype('float64')
I got the error below. I'm not sure where the problem is; I have tried many different ways, but I still get the same error. Any help is appreciated.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-52-607dcacd5a1a> in <module>
----> 1 m['Liquid Milk(Mil Litres)']=m['Liquid Milk(Mil Litres)'].replace("", np.nan).astype('float64')
2
3
4
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
5679 # else, only a single dtype is given
5680 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5681 **kwargs)
5682 return self._constructor(new_data).__finalize__(self)
5683
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float: '571,2'

A float's integer and fractional parts must be separated by a period (.), not a comma (,). Replace every comma with a period before converting.
float("571.2")
would work
float("571,2")
would fail.

TypeError: Cannot cast IntervalArray to dtype float64 when transform(pd.qcut,x)

Here's my dataset
result score
1 0.786
1 0.896
0 0.435
1 0.563
0 0.145
Here's my code
import pandas as pd
intervals = data.groupby('result')['score'].transform(pd.qcut, 10)
Here's the error
TypeError Traceback (most recent call last)
/opt/conda/lib/python3.8/site-packages/pandas/core/arrays/interval.py in astype(self, dtype, copy)
708 try:
--> 709 return np.asarray(self).astype(dtype, copy=copy)
710 except (TypeError, ValueError) as err:
TypeError: float() argument must be a string or a number, not 'pandas._libs.interval.Interval'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-88-429b3ee3f973> in <module>
1 data['score'] = pd.to_numeric(data['score'])
----> 2 intervals = data.groupby('result')['score'].transform(pd.qcut, 10)
3 data['Bin_low'] = pd.IntervalIndex(intervals).left
4 data['Bin_high'] = pd.IntervalIndex(intervals).right
/opt/conda/lib/python3.8/site-packages/pandas/core/groupby/generic.py in transform(self, func, engine, engine_kwargs, *args, **kwargs)
491
492 if not isinstance(func, str):
--> 493 return self._transform_general(
494 func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
495 )
/opt/conda/lib/python3.8/site-packages/pandas/core/groupby/generic.py in _transform_general(self, func, engine, engine_kwargs, *args, **kwargs)
557 dtype = self._selected_obj.dtype
558 if is_numeric_dtype(dtype):
--> 559 result = maybe_downcast_to_dtype(result, dtype)
560
561 result.name = self._selected_obj.name
/opt/conda/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in maybe_downcast_to_dtype(result, dtype)
150 dtype = np.dtype(dtype)
151
--> 152 converted = maybe_downcast_numeric(result, dtype, do_round)
153 if converted is not result:
154 return converted
/opt/conda/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in maybe_downcast_numeric(result, dtype, do_round)
250 and not is_string_dtype(result.dtype)
251 ):
--> 252 return result.astype(dtype)
253
254 return result
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5544 else:
5545 # else, only a single dtype is given
-> 5546 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
5547 return self._constructor(new_data).__finalize__(self, method="astype")
5548
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
593 self, dtype, copy: bool = False, errors: str = "raise"
594 ) -> "BlockManager":
--> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
596
597 def convert(
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
404 applied = b.apply(f, **kwargs)
405 else:
--> 406 applied = getattr(b, f)(**kwargs)
407 result_blocks = _extend_blocks(applied, result_blocks)
408
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
568 if self.is_extension:
569 try:
--> 570 values = self.values.astype(dtype)
571 except (ValueError, TypeError):
572 if errors == "ignore":
/opt/conda/lib/python3.8/site-packages/pandas/core/arrays/interval.py in astype(self, dtype, copy)
710 except (TypeError, ValueError) as err:
711 msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
--> 712 raise TypeError(msg) from err
713
714 @classmethod
TypeError: Cannot cast IntervalArray to dtype float64
What should I do to cast the IntervalArray?

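transform tries to cast the result back to the column's original numeric dtype (the maybe_downcast_to_dtype call in the traceback), which fails for intervals. Since qcut returns a Series with the same index, you can just use apply instead, which works fine: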
intervals = df.groupby('result')['score'].apply(pd.qcut, 10)
Output:
0 (0.741, 0.786]
1 (0.874, 0.896]
2 (0.406, 0.435]
3 (0.5619999999999999, 0.608]
4 (0.144, 0.174]
Name: score, dtype: interval

The astype function is not working mysteriously

I'm trying to convert these values to float so that I can sum() them. The problem is that something weird won't let me do it.
Data:
cw= pd.DataFrame({ "campaign": "151515151515" ,
"Media_Cost": "$ 14,52" })
cw.dtypes
Media_Cost object
My attempts,
I tried each of the lines of code below, one at a time; mysteriously, none of them works.
cw["Media_Cost"] = cw["Media_Cost"].str.replace('$','')
# Attempt 1
cw.Media_Cost = cw.Media_Cost.astype(float)
# Attempt 3
cw.Media_Cost = len(float(cw.Media_Cost))
# Attempt 4
cw.Media_Cost = cw.Media_Cost.apply(lambda x: float(cw.Media_Cost))
The error persists:
cw["Media_Cost"] = cw["Media_Cost"].str.replace('$','').str.replace(',', '.').astype(float)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-382-f5688d76abed> in <module>
1 # cw.Media_Cost = cw.Media_Cost.apply(lambda x: float(cw.Media_Cost))
----> 2 cw["Media_Cost"] = cw["Media_Cost"].str.replace('$','').str.replace(',', '.').astype(float)
3
4 # cw.Media_Cost = float(cw.Media_Cost)
5
~\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
~\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float: '1.443.48'

You can try:
cw = pd.DataFrame({"campaign": "151515151515", "Media_Cost": "$ 1,443.48" }, index=[0])
cw["Media_Cost"] = cw["Media_Cost"].str.replace('$','').str.replace(',', '').astype(float)
cw.dtypes
Result:
campaign object
Media_Cost float64
dtype: object

Why aren't my objects being converted to strings?

Starting with a Python object, I'm getting an error when I try to convert the string to a float using astype(str).astype(float).
I've used regular expressions to remove the units and spaces and removed rows with NA.
df['Length'] = df['Length'].astype(str).astype(float)
ValueError Traceback (most recent call last)
<ipython-input-137-724df1c0091a> in <module>
1 df['Length'] = df['Length'].astype(str).astype(float)
2 #df['Length'].astype(str).astype(float)
3 #df['Width'].astype(str).astype(float)
/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
/anaconda3/lib/python3.7/site-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
/anaconda3/lib/python3.7/site-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
/anaconda3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
/anaconda3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
/anaconda3/lib/python3.7/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float:

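As John pointed out, the error occurs when converting a string to float. Nothing is shown after the colon in the error message, which means the offending value is an empty string.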
To visually check for empty strings use df['Length'] == ''.
To count the number of empty strings use: sum(df['Length'] == '')
To drop the rows with empty strings use: df = df[df['Length'] != '']. This will modify your whole data frame and not just df['Length'].
Hope that helps.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Convert comma separator objects to numeric in Pandas - python

Change the replace call by adding one more pattern, so that the comma thousands separator is removed as well: df['my_col'] = df['my_col'].replace({'\$':'',',':''}, regex = True)

Related

could not convert string to float - object type

ValueError: could not convert string to float: '571,2'

TypeError: Cannot cast IntervalArray to dtype float64 when transform(pd.qcut,x)

The astype function is not working mysteriously

Why aren't my objects being converted to strings?

Categories

Resources