I am getting the following error and I am not sure what it is. Can you please help me identify what is causing this error?
def determine_device_type(*args, **kwargs):
    """Classify a device by the vendor letter embedded in its name.

    The first positional argument is converted to ``str`` and tested, in
    order, against one pattern per vendor: any word characters, then the
    vendor letter (``c``, ``r`` or ``j``), then a suffix of two digits, two
    word characters or four digits at the end of the name.

    Args:
        *args: ``args[0]`` is the device name; calling with no positional
            argument raises ``IndexError``. Further positionals are ignored.
        **kwargs: Accepted and ignored.

    Returns:
        ``'Cisco'``, ``'Riverbed'`` or ``'Juniper'`` for the first pattern
        that matches, otherwise the empty string.

    Side effects:
        Prints ``<name>  -->  <vendor>`` for every name that is classified.
    """
    device_name = str(args[0])
    # Order matters: the first matching vendor wins, as in the original chain.
    vendors = (('c', 'Cisco'), ('r', 'Riverbed'), ('j', 'Juniper'))
    for vendor_letter, vendor_name in vendors:
        pattern = r'^\w+' + vendor_letter + r'(\d\d|\w\w|\d\d\d\d)$'
        if re.search(pattern, device_name):
            print(device_name, ' --> ', vendor_name)
            return vendor_name
    return ''
# Compute one vendor label per device name, then add it as a new column at
# position 3.
# NOTE(review): per the traceback, DataFrame.insert raises
# "cannot insert VENDOR, already exists" when the frame already has a
# 'VENDOR' column. The traceback also shows the column as 'MDN_DEVICE',
# not 'DEVICE' -- confirm which name is current.
vendor = df['DEVICE'].apply(determine_device_type)
df.insert(3, 'VENDOR', vendor)
Error
ValueError Traceback (most recent call last)
<ipython-input-42-8b32a12d4761> in <module>()
21
22 vendor = df['MDN_DEVICE'].apply(determine_device_type)
---> 23 df.insert(3, 'VENDOR', vendor)
24 df
C:\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in insert(self, loc, column, value, allow_duplicates)
2611 value = self._sanitize_column(column, value, broadcast=False)
2612 self._data.insert(loc, column, value,
-> 2613 allow_duplicates=allow_duplicates)
2614
2615 def assign(self, **kwargs):
C:\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in insert(self, loc, item, value, allow_duplicates)
4061 if not allow_duplicates and item in self.items:
4062 # Should this be a different kind of error??
-> 4063 raise ValueError('cannot insert {}, already exists'.format(item))
4064
4065 if not isinstance(loc, int):
ValueError: cannot insert VENDOR, already exists
It seems you already have a 'VENDOR' column in `df`; you can call `df.head()` to confirm. To avoid the error, drop the existing column before inserting the new one.
# `drop` returns a new frame and `insert` mutates in place (returning None),
# so chaining them would only change a throw-away copy. Rebind `df` first.
# errors='ignore' keeps this safe when 'VENDOR' is not present yet.
df = df.drop(columns=['VENDOR'], errors='ignore')
df.insert(3, 'VENDOR', vendor)
Related
I inherited this code from a previous employee. When I try to run it, I get an error.
def replaceitem(x):
    """Map the individual CRM product names onto one shared label.

    Args:
        x: The value to normalise.

    Returns:
        ``'CRM Application'`` when ``x`` is one of ``'ORION'``, ``'ACTION'``,
        ``'ICE'``, ``'IRIS'`` or ``'FOCUS'`` (exact, case-sensitive match);
        otherwise ``x`` unchanged.
    """
    crm_names = ('ORION', 'ACTION', 'ICE', 'IRIS', 'FOCUS')
    return 'CRM Application' if x in crm_names else x
def clean_list(row):
    """Return the distinct, normalised application names of one row.

    Every entry of ``row['APLN_NM']`` is passed through ``replaceitem`` and
    duplicates are dropped, keeping the first occurrence of each value in
    its original position.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) whose ``'APLN_NM'``
            entry is an iterable of hashable application names.

    Returns:
        A new list of the distinct mapped names in first-seen order.
    """
    # One pass: mapping first and de-duplicating on the mapped value gives the
    # same first-seen order as de-duplicating before and after the mapping,
    # without the quadratic list.index() lookups.
    seen = set()
    distinct = []
    for name in row['APLN_NM']:
        mapped = replaceitem(name)
        if mapped not in seen:
            seen.add(mapped)
            distinct.append(mapped)
    return distinct
#*********************************************************************************************************************************************
# Add the de-duplicated application list as a new column on both frames.
# NOTE(review): per the traceback, it is the second assignment (on
# df_agg_single) that raises "Wrong number of items passed 3, placement
# implies 1".
df_agg['APLN_NM_DISTINCT'] = df_agg.apply(clean_list, axis = 1)
df_agg_single['APLN_NM_DISTINCT'] = df_agg_single.apply(clean_list, axis = 1)
While running the code I got this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2890 try:
-> 2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'APLN_NM_DISTINCT'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/generic.py in _set_item(self, key, value)
3570 try:
-> 3571 loc = self._info_axis.get_loc(key)
3572 except KeyError:
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2892 except KeyError as err:
-> 2893 raise KeyError(key) from err
2894
KeyError: 'APLN_NM_DISTINCT'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-71-e8b5e8d5b514> in <module>
431 #*********************************************************************************************************************************************
432 df_agg['APLN_NM_DISTINCT'] = df_agg.apply(clean_list, axis = 1)
--> 433 df_agg_single['APLN_NM_DISTINCT'] = df_agg_single.apply(clean_list, axis = 1)
434
435 df_agg['TOTAL_HOLD_TIME'] = df_agg_single['TOTAL_HOLD_TIME'].astype(int)
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
3038 else:
3039 # set column
-> 3040 self._set_item(key, value)
3041
3042 def _setitem_slice(self, key: slice, value):
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/frame.py in _set_item(self, key, value)
3115 self._ensure_valid_index(value)
3116 value = self._sanitize_column(key, value)
-> 3117 NDFrame._set_item(self, key, value)
3118
3119 # check if we are modifying a copy
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/generic.py in _set_item(self, key, value)
3572 except KeyError:
3573 # This item wasn't present, just insert at end
-> 3574 self._mgr.insert(len(self._info_axis), key, value)
3575 return
3576
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/internals/managers.py in insert(self, loc, item, value, allow_duplicates)
1187 value = _safe_reshape(value, (1,) + value.shape)
1188
-> 1189 block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))
1190
1191 for blkno, count in _fast_count_smallints(self.blknos[loc:]):
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/internals/blocks.py in make_block(values, placement, klass, ndim, dtype)
2717 values = DatetimeArray._simple_new(values, dtype=dtype)
2718
-> 2719 return klass(values, ndim=ndim, placement=placement)
2720
2721
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/internals/blocks.py in __init__(self, values, placement, ndim)
2373 values = np.array(values, dtype=object)
2374
-> 2375 super().__init__(values, ndim=ndim, placement=placement)
2376
2377 #property
/opt/rh/rh-python36/root/usr/lib64/python3.6/site-packages/pandas/core/internals/blocks.py in __init__(self, values, placement, ndim)
128 if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
129 raise ValueError(
--> 130 f"Wrong number of items passed {len(self.values)}, "
131 f"placement implies {len(self.mgr_locs)}"
132 )
ValueError: Wrong number of items passed 3, placement implies 1
df_agg and df_agg_single are dataframes with same column names.
But the data is present only in df_agg
data in df_agg dataframe looks like this
data in df_agg_single dataframe looks like this
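So the error occurs when the data frame is empty: applying clean_list row-wise to a frame with no rows gives back an empty DataFrame (with all of the original columns) instead of a Series, and a multi-column result cannot be assigned to the single column 'APLN_NM_DISTINCT'.
I confirmed that the error occurs only when the data frame is empty, so I added an if/else check to handle the empty case separately, and it worked.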
# Add 'APLN_NM_DISTINCT' to both frames. A frame with no rows only gets the
# (empty) column created; clean_list is applied row-wise only when there are
# rows to process.
for frame in (df_agg, df_agg_single):
    if frame.empty:
        frame['APLN_NM_DISTINCT'] = ''
    else:
        frame['APLN_NM_DISTINCT'] = frame.apply(clean_list, axis=1)
I want to create a function that basically pulls two specific values or "cells" from my data frame. The data frame is a little large so I attached a picture of it. My current code for the function
def return_riskscore(CountryA, CountryB):
    """Return the single cell of ``main7`` at row ``CountryA``, column ``CountryB``.

    The lookup uses ``DataFrame.at``, which is label-based: ``CountryA`` must
    be a label of the frame's index and ``CountryB`` a column label.

    Args:
        CountryA: Row (index) label to look up.
        CountryB: Column label to look up.

    Returns:
        The value stored at that row/column of the module-level ``main7``.

    Raises:
        KeyError: If either label is not present in ``main7``.
    """
    return main7.at[CountryA, CountryB]
Then I used the following to apply the function to the dataframe.
# NOTE(review): return_riskscore(...) is evaluated right here, before apply()
# runs, so apply() receives its return value rather than a function. Also,
# 'Ukraine ' carries a trailing space here, unlike the 'Ukraine' shown in the
# traceback.
main7["Value"] = main7.apply(return_riskscore(CountryA = 'Ukraine ', CountryB = 'Russia'))
The error I receive is as follows:
KeyError Traceback (most recent call last)
/var/folders/bl/qq18pm0d3kv24sfpjxbb_d2w0000gn/T/ipykernel_29770/4285294519.py in <module>
----> 1 main7["Value"] = main7.apply(return_riskscore(CountryA = 'Ukraine', CountryB = 'Russia'))
/var/folders/bl/qq18pm0d3kv24sfpjxbb_d2w0000gn/T/ipykernel_29770/1369879570.py in return_riskscore(CountryA, CountryB)
2 i = CountryA
3 j = CountryB
----> 4 senti_value = main7.at[i,j]
5 return senti_value
6 #for i in range(len(main7)):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
2273 return self.obj.loc[key]
2274
-> 2275 return super().__getitem__(key)
2276
2277 def __setitem__(self, key, value):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
2220
2221 key = self._convert_key(key)
-> 2222 return self.obj._get_value(*key, takeable=self._takeable)
2223
2224 def __setitem__(self, key, value):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in _get_value(self, index, col, takeable)
3577
3578 try:
-> 3579 loc = engine.get_loc(index)
3580 return series._values[loc]
3581 except AttributeError:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: 'Ukraine'
More specifically, I want to be able to pull country A's and country B's value.
Is there another way to achieve this? an example output is the value (number) at the specified Row , Country title. For example if Country A = Ukraine and Country B = Russia, then the output I expect is 1.0 Data Frame
I'm sorry that I'm a begginer, so this might be something simple but I don't know what might be wrong.
import pandas as pd
from pandas_datareader import data as wb
tickers = ['F','MSFT','BP']
new_data = pd.DataFrame()
for t in tickers:
    # NOTE(review): this is the assignment the traceback points at. The whole
    # DataReader result is assigned to a single column, and pandas reports
    # 6 items being passed where one column is expected.
    new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')
This is the error I get:
ValueError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value)
3524 try:
-> 3525 value = Series(value)
3526 except (ValueError, NotImplementedError, TypeError):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
312
--> 313 data = SingleBlockManager(data, index, fastpath=True)
314
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, block, axis, do_integrity_check, fastpath)
1515 if not isinstance(block, Block):
-> 1516 block = make_block(block, placement=slice(0, len(axis)), ndim=1)
1517
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in make_block(values, placement, klass, ndim, dtype, fastpath)
3266
-> 3267 return klass(values, ndim=ndim, placement=placement)
3268
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
2774
-> 2775 super().__init__(values, ndim=ndim, placement=placement)
2776
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
127 "Wrong number of items passed {val}, placement implies "
--> 128 "{mgr}".format(val=len(self.values), mgr=len(self.mgr_locs))
129 )
ValueError: Wrong number of items passed 6, placement implies 1318
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-11-6f48653a5e0e> in <module>
2 new_data = pd.DataFrame()
3 for t in tickers:
----> 4 new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3470 else:
3471 # set column
-> 3472 self._set_item(key, value)
3473
3474 def _setitem_slice(self, key, value):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3546 """
3547
-> 3548 self._ensure_valid_index(value)
3549 value = self._sanitize_column(key, value)
3550 NDFrame._set_item(self, key, value)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value)
3526 except (ValueError, NotImplementedError, TypeError):
3527 raise ValueError(
-> 3528 "Cannot set a frame with no defined index "
3529 "and a value that cannot be converted to a "
3530 "Series"
ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series
I imagine this has got something to do with missing information? Reading int as text or something? Can someone help?
This is actually from the answer key of an exercise I am doing from an online course, and it's returning this error.
You can only assign to a DataFrame column data that has the shape of a single column, i.e. a Series in pandas terms. Let's look at what DataReader returns:
wb.DataReader('F', data_source='yahoo', start='2015-1-1').head()
Date High Low Open Close Volume AdjClose
2015-01-02 15.65 15.18 15.59 15.36 24777900.0 11.452041
2015-01-05 15.13 14.69 15.12 14.76 44079700.0 11.004693
2015-01-06 14.90 14.38 14.88 14.62 32981600.0 10.900313
2015-01-07 15.09 14.77 14.78 15.04 26065300.0 11.213456
2015-01-08 15.48 15.23 15.40 15.42 33943400.0 11.496773
It returns a dataframe. If you'd like to pick one column from here - say, Open - you can assign that to your new table.
tickers = ['F','MSFT','BP']
new_data = pd.DataFrame()
for t in tickers:
    # DataReader returns a multi-column frame; keep only the 'Open' column so
    # that each ticker column is assigned a single Series.
    new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')['Open']
new_data.head()
Date F MSFT BP
2015-01-02 15.59 46.660000 38.209999
2015-01-05 15.12 46.369999 36.590000
2015-01-06 14.88 46.380001 36.009998
2015-01-07 14.78 45.980000 36.000000
2015-01-08 15.40 46.750000 36.430000
Alternatively, you could store each ticker's full DataFrame in a dictionary:
import pandas as pd
from pandas_datareader import data as wb
tickers = ['F','MSFT','BP']
# A plain dict can hold one complete DataFrame per ticker.
new_data = {}
for t in tickers:
    new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')
print(new_data['F'])
You are trying to assign an entire DataFrame to a single column of another DataFrame.
I am trying to run this line of code:
df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']), axis = 1)
But for some address and zipcodes the function get_zillow_id() fails. But I want the lambda function to just ignore the error for that particular address and zipcode and continue. How do I do that?
Here is the entire code:
from pyzillow.pyzillow import ZillowWrapper, GetDeepSearchResults, GetUpdatedPropertyDetails
import pandas as pd
import numpy as np
# Zillow API key passed to ZillowWrapper by the functions below.
# NOTE(review): this looks like a real credential hard-coded in source --
# consider loading it from an environment variable or config file instead.
key = "X1-ZWz1gtmiat11xn_7ew1d"
# Create function to get zillow_id
def get_zillow_id(key, address, zipcode):
    """Return the Zillow ID that a deep search reports for one property.

    Args:
        key: Zillow API key handed to ``ZillowWrapper``.
        address: Address passed to the deep search.
        zipcode: Zip code passed to the deep search.

    Returns:
        The ``zillow_id`` attribute of the parsed ``GetDeepSearchResults``.

    Raises:
        Whatever the pyzillow calls raise (the traceback shows
        ``ZillowError`` coming out of ``get_deep_search_results``); nothing
        is caught here.
    """
    zillow_data = ZillowWrapper(key)
    deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
    result = GetDeepSearchResults(deep_search_response)
    return result.zillow_id
# Create function to get propery data
def get_property_data(key, address, zipcode):
    """Return the ``year_built`` value Zillow reports for one property.

    The property is first resolved to a Zillow ID via ``get_zillow_id`` and
    that ID is then used to request the updated property details.

    Args:
        key: Zillow API key handed to ``ZillowWrapper``.
        address: Address of the property.
        zipcode: Zip code of the property.

    Returns:
        The ``year_built`` attribute of the parsed
        ``GetUpdatedPropertyDetails``.

    Raises:
        Whatever ``get_zillow_id`` or the pyzillow calls raise; nothing is
        caught here.
    """
    zillow_data = ZillowWrapper(key)
    zillow_id = get_zillow_id(key, address, zipcode)
    updated_property_details_response = zillow_data.get_updated_property_details(zillow_id)
    result = GetUpdatedPropertyDetails(updated_property_details_response)
    return result.year_built
# Import data into dataframe
df = pd.read_csv('test.csv')
# Look up the Zillow ID for every row from its 'Address' and 'Zipcode' columns.
# NOTE(review): per the traceback, an exception raised inside get_zillow_id
# for a single row propagates out of apply() and aborts the whole assignment.
df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']), axis = 1)
Here is a picture of the data frame:
Here is the error I am getting:
ZillowError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self, code_obj, result)
2861 #rprint('Running code', repr(code_obj)) # dbg
-> 2862 exec(code_obj, self.user_global_ns, self.user_ns)
2863 finally:
<ipython-input-40-55f38b77eeea> in <module>()
1 # Get zillow ids
----> 2 df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']), axis = 1)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4261 reduce=reduce,
-> 4262 ignore_failures=ignore_failures)
4263 else:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4357 for i, v in enumerate(series_gen):
-> 4358 results[i] = func(v)
4359 keys.append(v.name)
<ipython-input-40-55f38b77eeea> in <lambda>(row)
1 # Get zillow ids
----> 2 df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']), axis = 1)
<ipython-input-37-ce158395fdb8> in get_zillow_id(key, address, zipcode)
3 zillow_data = ZillowWrapper(key)
----> 4 deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
5 result = GetDeepSearchResults(deep_search_response)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyzillow\pyzillow.py in get_deep_search_results(self, address, zipcode)
30 }
---> 31 return self.get_data(url, params)
32
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyzillow\pyzillow.py in get_data(self, url, params)
81 if response.findall('message/code')[0].text is not '0':
---> 82 raise ZillowError(int(response.findall('message/code')[0].text))
83 else:
<class 'str'>: (<class 'TypeError'>, TypeError('__str__ returned non-string (type dict)',))
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self, code_obj, result)
2877 if result is not None:
2878 result.error_in_exec = sys.exc_info()[1]
-> 2879 self.showtraceback(running_compiled_code=True)
2880 else:
2881 outflag = False
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in showtraceback(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)
1809 value, tb, tb_offset=tb_offset)
1810
-> 1811 self._showtraceback(etype, value, stb)
1812 if self.call_pdb:
1813 # drop into debugger
~\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\zmqshell.py in _showtraceback(self, etype, evalue, stb)
541 u'traceback' : stb,
542 u'ename' : unicode_type(etype.__name__),
--> 543 u'evalue' : py3compat.safe_unicode(evalue),
544 }
545
~\AppData\Local\Continuum\anaconda3\lib\site-packages\ipython_genutils\py3compat.py in safe_unicode(e)
63 """
64 try:
---> 65 return unicode_type(e)
66 except UnicodeError:
67 pass
TypeError: __str__ returned non-string (type dict)
You should try to understand exactly why your function fails. Then use a try/except clause to ignore only the specific exceptions you want to skip. For example, to ignore TypeError and ZillowError:
def get_zillow_id(key, address, zipcode):
    """Return the Zillow ID for one property, or ``None`` if the lookup fails.

    Args:
        key: Zillow API key handed to ``ZillowWrapper``.
        address: Address passed to the deep search.
        zipcode: Zip code passed to the deep search.

    Returns:
        The ``zillow_id`` attribute of the parsed search result, or ``None``
        when the lookup raises ``TypeError`` or ``ZillowError``. Any other
        exception still propagates.
    """
    # The traceback shows ZillowError being raised from inside
    # pyzillow/pyzillow.py, so the name is available in that module.
    from pyzillow.pyzillow import ZillowError

    try:
        zillow_data = ZillowWrapper(key)
        deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
        result = GetDeepSearchResults(deep_search_response)
        return result.zillow_id
    # Several exception types must be given as a tuple; the comma form
    # `except TypeError, ZillowError:` is a SyntaxError in Python 3.
    except (TypeError, ZillowError):
        return None
df['Zillow ID'] = df.apply(lambda row: get_zillow_id(key, row['Address'], row['Zipcode']),
axis=1)
If ZillowError is an actual error, you may need to import it from that library.
I have a very similar pandas dataframe :
In:
import pandas as pd
df = pd.DataFrame({'blobs':['6-Feb- 1 4 Startup ZestFinance says it has built a machine-learning system that’s smart enough to find new borrowers and keep bias out of its credit analysis. 17-Feb-2014',
'Credit ratings have long been the key measure of how likely a U.S. 29—Oct-2012 consumer is to repay any loan, from mortgages to 18-0ct-12 credit cards. But the factors that FICO and other companies that create credit scores rely on—things like credit history and credit card balances—often depend on having credit already. ',
'November 22, 2012 In recent years, a crop of startup companies have launched on the premise 6—Feb- ] 4 that borrowers without such histories might still be quite likely to repay, and that their likelihood of doing so could be determined by analyzing large amounts of data, especially data that has traditionally not been part of the credit evaluation. These companies use algorithms and machine learning to find meaningful patterns in the data, alternative signs that a borrower is a good or bad credit risk.',
'March 1“, 2012 Los Angeles-based ZestFinance, founded by former Google CIO Douglas Merrill, claims to have solved this problem with a new credit-scoring platform, called ZAML. 06—Fcb—2012 The company sells the machine-learning software to lenders and also offers consulting 19—Jan— ] 2 services. Zest does not lend money itself. January 2, 1990']})
Each row has some noisy dates in different formats. My objective is to extract the dates into separate columns; for example, from the above data frame I would like to extract dates like this:
(*)
Out:
dates1 dates2
0 6-Feb-14, 17-Feb-2014 NaN
1 29—Oct-2012, 18-0ct-12 NaN
2 6—Feb- ]4 November 22, 2012
3 06—Fcb—2012, 19—Jan— ] 2 March 1“, 2012 | January 2, 1990
So far, I tried to define the following regex and extract from the column with extract():
In:
df['col1'] = df['blobs'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$")
df
However I am getting the following error and I would like to have a more robust method. Therefore, which is the best way of getting (*)
Out:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2133 try:
-> 2134 return self._engine.get_loc(key)
2135 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'date_format_3'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
3667 try:
-> 3668 loc = self.items.get_loc(item)
3669 except KeyError:
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'date_format_3'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-3-ec929aa5341c> in <module>()
----> 1 df['date_format_3'] = df['text'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$", expand = True)
2 df
/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2417 else:
2418 # set column
-> 2419 self._set_item(key, value)
2420
2421 def _setitem_slice(self, key, value):
/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in _set_item(self, key, value)
2484 self._ensure_valid_index(value)
2485 value = self._sanitize_column(key, value)
-> 2486 NDFrame._set_item(self, key, value)
2487
2488 # check if we are modifying a copy
/usr/local/lib/python3.5/site-packages/pandas/core/generic.py in _set_item(self, key, value)
1498
1499 def _set_item(self, key, value):
-> 1500 self._data.set(key, value)
1501 self._clear_item_cache()
1502
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
3669 except KeyError:
3670 # This item wasn't present, just insert at end
-> 3671 self.insert(len(self.items), item, value)
3672 return
3673
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
3770
3771 block = make_block(values=value, ndim=self.ndim,
-> 3772 placement=slice(loc, loc + 1))
3773
3774 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2683 placement=placement, dtype=dtype)
2684
-> 2685 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2686
2687 # TODO: flexible with index=None and/or items=None
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, ndim, fastpath, placement, **kwargs)
1815
1816 super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath,
-> 1817 placement=placement, **kwargs)
1818
1819 #property
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath)
107 raise ValueError('Wrong number of items passed %d, placement '
108 'implies %d' % (len(self.values),
--> 109 len(self.mgr_locs)))
110
111 #property
ValueError: Wrong number of items passed 4, placement implies 1
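str.extract() returns one column per capture group (a part of the pattern enclosed in parentheses). Your pattern contains four capture groups, so the result has four columns, which cannot be assigned to a single column; hence "Wrong number of items passed 4, placement implies 1". Wrap the part you want to extract in exactly one capture group.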
I extended/corrected the regex pattern, but it needs more work.
(You can review and experiment with this regular expression on regex101)
# Raw string: \d, \s and \w must reach the regex engine as written (in a
# normal string literal they are invalid escape sequences). The pattern has a
# single capture group, so extract() yields exactly one column.
df['blobs'].str.extract(r'(\d{1,2}[\s-]\w+[\s-]\d{2,4}|\w+[\s-]\d{1,2}[",\s-]+\d{2,4})', expand=True)
Out:
0
0 17-Feb-2014
1 18-0ct-12
2 November 22, 2012
3 January 2, 1990