How to solve this problem in Python Importing Stocks data? - python

I'm sorry, I'm a beginner, so this might be something simple, but I don't know what might be wrong.
import pandas as pd
from pandas_datareader import data as wb
tickers = ['F','MSFT','BP']
new_data = pd.DataFrame()
for t in tickers:
    # NOTE: this is the assignment that raises the ValueError shown in the
    # traceback below.
    new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')
This is the error I get:
ValueError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value)
3524 try:
-> 3525 value = Series(value)
3526 except (ValueError, NotImplementedError, TypeError):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
312
--> 313 data = SingleBlockManager(data, index, fastpath=True)
314
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, block, axis, do_integrity_check, fastpath)
1515 if not isinstance(block, Block):
-> 1516 block = make_block(block, placement=slice(0, len(axis)), ndim=1)
1517
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in make_block(values, placement, klass, ndim, dtype, fastpath)
3266
-> 3267 return klass(values, ndim=ndim, placement=placement)
3268
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
2774
-> 2775 super().__init__(values, ndim=ndim, placement=placement)
2776
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
127 "Wrong number of items passed {val}, placement implies "
--> 128 "{mgr}".format(val=len(self.values), mgr=len(self.mgr_locs))
129 )
ValueError: Wrong number of items passed 6, placement implies 1318
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-11-6f48653a5e0e> in <module>
2 new_data = pd.DataFrame()
3 for t in tickers:
----> 4 new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3470 else:
3471 # set column
-> 3472 self._set_item(key, value)
3473
3474 def _setitem_slice(self, key, value):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3546 """
3547
-> 3548 self._ensure_valid_index(value)
3549 value = self._sanitize_column(key, value)
3550 NDFrame._set_item(self, key, value)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value)
3526 except (ValueError, NotImplementedError, TypeError):
3527 raise ValueError(
-> 3528 "Cannot set a frame with no defined index "
3529 "and a value that cannot be converted to a "
3530 "Series"
ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series
I imagine this has got something to do with missing information? Reading int as text or something? Can someone help?
This is actually from the answer key of an exercise I am doing from an online course, and it's returning this error.

You can only assign data of the same size to a column within a DataFrame: a column or Series (in Pandas terms). Let's look at what DataReader returns:
wb.DataReader('F', data_source='yahoo', start='2015-1-1').head()
Date High Low Open Close Volume AdjClose
2015-01-02 15.65 15.18 15.59 15.36 24777900.0 11.452041
2015-01-05 15.13 14.69 15.12 14.76 44079700.0 11.004693
2015-01-06 14.90 14.38 14.88 14.62 32981600.0 10.900313
2015-01-07 15.09 14.77 14.78 15.04 26065300.0 11.213456
2015-01-08 15.48 15.23 15.40 15.42 33943400.0 11.496773
It returns a dataframe. If you'd like to pick one column from here - say, Open - you can assign that to your new table.
tickers = ['F','MSFT','BP']
new_data = pd.DataFrame()
for t in tickers:
    # Keep only the 'Open' column of the returned DataFrame so that a single
    # Series is assigned to each ticker's column.
    new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')['Open']
new_data.head()
Date F MSFT BP
2015-01-02 15.59 46.660000 38.209999
2015-01-05 15.12 46.369999 36.590000
2015-01-06 14.88 46.380001 36.009998
2015-01-07 14.78 45.980000 36.000000
2015-01-08 15.40 46.750000 36.430000

You could assign to a dictionary;
import pandas as pd
from pandas_datareader import data as wb
tickers = ['F','MSFT','BP']
# A plain dict can hold one complete DataReader result per ticker.
new_data = {}
for t in tickers:
    new_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')
print(new_data['F'])
In your original code, you are trying to assign an entire DataFrame to a single column of another DataFrame.

Related

How do I create a function that pulls two values from my dataframe?

I want to create a function that basically pulls two specific values or "cells" from my data frame. The data frame is a little large so I attached a picture of it. My current code for the function
def return_riskscore(CountryA, CountryB):
    """Return the single value of ``main7`` at row ``CountryA``, column ``CountryB``.

    The lookup uses the label-based ``main7.at[row, column]`` accessor, so
    ``CountryA`` must be a label of the frame's index and ``CountryB`` a
    column label; a label that is not present raises ``KeyError``.
    """
    i = CountryA
    j = CountryB
    senti_value = main7.at[i,j]
    return senti_value
Then I used the following to apply the function to the dataframe.
main7["Value"] = main7.apply(return_riskscore(CountryA = 'Ukraine ', CountryB = 'Russia'))
The error I receive is as follows:
KeyError Traceback (most recent call last)
/var/folders/bl/qq18pm0d3kv24sfpjxbb_d2w0000gn/T/ipykernel_29770/4285294519.py in <module>
----> 1 main7["Value"] = main7.apply(return_riskscore(CountryA = 'Ukraine', CountryB = 'Russia'))
/var/folders/bl/qq18pm0d3kv24sfpjxbb_d2w0000gn/T/ipykernel_29770/1369879570.py in return_riskscore(CountryA, CountryB)
2 i = CountryA
3 j = CountryB
----> 4 senti_value = main7.at[i,j]
5 return senti_value
6 #for i in range(len(main7)):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
2273 return self.obj.loc[key]
2274
-> 2275 return super().__getitem__(key)
2276
2277 def __setitem__(self, key, value):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
2220
2221 key = self._convert_key(key)
-> 2222 return self.obj._get_value(*key, takeable=self._takeable)
2223
2224 def __setitem__(self, key, value):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in _get_value(self, index, col, takeable)
3577
3578 try:
-> 3579 loc = engine.get_loc(index)
3580 return series._values[loc]
3581 except AttributeError:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: 'Ukraine'
More specifically, I want to be able to pull country A's and country B's value.
Is there another way to achieve this? An example output is the value (number) at the specified row and country column. For example, if Country A = Ukraine and Country B = Russia, then the output I expect is 1.0. Data Frame

Error **ValueError: Wrong number of items passed 2, placement implies 1**

Wrong number of items passed 2, placement implies 1
I want to calculate 'overnight returns' and 'intraday returns' for AMZN and AAPL, but I get an error. The formulas are:
df['intradayReturn'] = (df1["adj_close"]/df1["open"])-1
df['overnightReturn'] = (df1["open_shift"]/df1["adj_close"])-1
in Python. Below is my code:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import warnings
# `yf` is presumably yfinance imported as `yf` -- the import is not shown
# in the snippet; TODO confirm.
# Downloading several tickers at once yields two-level columns
# (price field, ticker), as the output below shows.
df = yf.download(['AAPL','AMZN'],start='2006-01-01',end='2020-12-31')
df.head()
Adj Close Close High Low Open Volume
AAPL AMZN AAPL AMZN AAPL AMZN AAPL AMZN AAPL AMZN AAPL AMZN
Date
2006-01-03 2.299533 47.580002 2.669643 47.580002 2.669643 47.849998 2.580357 46.250000 2.585000 47.470001 807234400 7582200
2006-01-04 2.306301 47.250000 2.677500 47.250000 2.713571 47.730000 2.660714 46.689999 2.683214 47.490002 619603600 7440900
2006-01-05 2.288151 47.650002 2.656429 47.650002 2.675000 48.200001 2.633929 47.110001 2.672500 47.160000 449422400 5417200
2006-01-06 2.347216 47.869999 2.725000 47.869999 2.739286 48.580002 2.662500 47.320000 2.687500 47.970001 704457600 6152900
2006-01-09 2.339525 47.080002 2.716071 47.080002 2.757143 47.099998 2.705000 46.400002 2.740357 46.549999 675040800 8943100
Up to this point it was working fine.
But when I used this formula:
df['intradayReturn'] = (df["Adj Close"]/df["Open"])-1
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3079 try:
-> 3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'intradayReturn'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _set_item(self, key, value)
3824 try:
-> 3825 loc = self._info_axis.get_loc(key)
3826 except KeyError:
~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in get_loc(self, key, method)
2875 if not isinstance(key, tuple):
-> 2876 loc = self._get_level_indexer(key, level=0)
2877 return _maybe_to_slice(loc)
~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_indexer(self, key, level, indexer)
3157
-> 3158 idx = self._get_loc_single_level_index(level_index, key)
3159
~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_loc_single_level_index(self, level_index, key)
2808 else:
-> 2809 return level_index.get_loc(key)
2810
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3081 except KeyError as err:
-> 3082 raise KeyError(key) from err
3083
KeyError: 'intradayReturn'
enter code here
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-24-b2a4b46270a9> in <module>
----> 1 df['intradayReturn'] = (df["Adj Close"]/df["Open"])-1
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3161 else:
3162 # set column
-> 3163 self._set_item(key, value)
3164
3165 def _setitem_slice(self, key: slice, value):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3238 self._ensure_valid_index(value)
3239 value = self._sanitize_column(key, value)
-> 3240 NDFrame._set_item(self, key, value)
3241
3242 # check if we are modifying a copy
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _set_item(self, key, value)
3826 except KeyError:
3827 # This item wasn't present, just insert at end
-> 3828 self._mgr.insert(len(self._info_axis), key, value)
3829 return
3830
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in insert(self, loc, item, value, allow_duplicates)
1201 value = safe_reshape(value, (1,) + value.shape)
1202
-> 1203 block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))
1204
1205 for blkno, count in _fast_count_smallints(self.blknos[loc:]):
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in make_block(values, placement, klass, ndim, dtype)
2730 values = DatetimeArray._simple_new(values, dtype=dtype)
2731
-> 2732 return klass(values, ndim=ndim, placement=placement)
2733
2734
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
141 if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
142 raise ValueError(
--> 143 f"Wrong number of items passed {len(self.values)}, "
144 f"placement implies {len(self.mgr_locs)}"
145 )
It shows this error:
ValueError: Wrong number of items passed 2, placement implies 1
It's easy: you can do this with the code below.
# Intraday return per ticker: adjusted close divided by open, minus 1.
# df has two-level columns (price field, ticker), so df['Adj Close']['AAPL']
# selects the single series for one ticker.
gf = (df['Adj Close']['AAPL']/df['Open']['AAPL']-1).to_frame()
sd = (df['Adj Close']['AMZN']/df['Open']['AMZN']-1).to_frame()
# Assign each ticker's result under its own ('intradayReturn', ticker) key;
# assigning both tickers to the one key 'intradayReturn' is what raised
# "Wrong number of items passed 2, placement implies 1".
df['intradayReturn', 'AAPL'] = gf
df['intradayReturn', 'AMZN'] = sd
df.head()
so your output is like:
You need to assign each ticker separately when it comes to multi-index columns.
Now it's your turn for the overnight return 😉.

KeyError: date value

I got a csv file with the following columns:
Province/State Country/Region Lat Long 1/22/20 1/23/20 ...
This is the dataframe:
Country/Region 1/22/20 1/23/20 1/24/20
Afghanistan 100 200 300
Albania 400 500 0
Algeria 20 30 70
I'm trying to get a function with inputs: csv-file and last date I want to filter.
Here I show the function I did:
def create_covid_pickle (csv_doc, date):
    """Read a COVID time-series CSV and return per-country case lists up to ``date``.

    Parameters:
        csv_doc: path (or anything ``pd.read_csv`` accepts) of a CSV with the
            columns ``Country/Region``, ``Lat``, ``Long`` and one column per day.
        date: last date to keep, as a string in ``"%m-%d-%y"`` format.

    Returns:
        A list with one dict per country of the form
        ``{country: {'time': [column labels], 'cases': [first row's values]}}``.

    NOTE: ``date`` is re-formatted with ``strftime("%m/%d/%y")``, which always
    zero-pads month and day. If the CSV header uses unpadded labels such as
    ``1/6/20``, the padded label is not found and the ``.loc`` slice raises
    ``KeyError``.
    """
    csv_doc = pd.read_csv(csv_doc)
    # assign correct format to date
    date = datetime.datetime.strptime(date,"%m-%d-%y")
    date = date.strftime("%m/%d/%y")
    # delete columns I don't need
    csv_doc = csv_doc.loc[:, 'Country/Region': date]
    csv_doc = csv_doc.drop(columns = ['Lat', 'Long'])
    # to_dict
    # csv_dictionary = csv_doc.to_dict()
    csv_dictionary = [{c: {'time': d.columns.tolist(), 'cases': d.values.tolist()[0]}}
                      for c, d in csv_doc.set_index(['Country/Region']).groupby('Country/Region')]
    return csv_dictionary
I am getting 2 errors in csv_doc = csv_doc.loc[:, 'Country/Region': date]: the first when using .loc[], and the second because of the date value.
The complete error message is:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
4844 try:
-> 4845 return self._searchsorted_monotonic(label, side)
4846 except ValueError:
~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py in _searchsorted_monotonic(self, label, side)
4805
-> 4806 raise ValueError("index must be monotonic increasing or decreasing")
4807
ValueError: index must be monotonic increasing or decreasing
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-2-a84a2fe01741> in <module>
1 # Test
----> 2 create_covid_pickle("data/time_series_covid19_confirmed_global.csv", "01-06-20")
3
4 # Load and print some data
5 # country_cases = pickle.load(open("primera_ola.pkl", "rb"))
<ipython-input-1-1c34e9c20811> in create_covid_pickle(csv_doc, date)
29
30 # seleccionar columnas - crear funcion 1 indepte?
---> 31 csv_doc = csv_doc.loc[:, 'Country/Region': date]
32
33 # REVISAR FUNCION CON NUEVOS CAMBIOS DE LAS LINEAS ANTERIORES
~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1287 continue
1288
-> 1289 retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
1290
1291 return retval
~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1910 if isinstance(key, slice):
1911 self._validate_key(key, axis)
-> 1912 return self._get_slice_axis(key, axis=axis)
1913 elif com.is_bool_indexer(key):
1914 return self._getbool_axis(key, axis=axis)
~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _get_slice_axis(self, slice_obj, axis)
1794
1795 labels = obj._get_axis(axis)
-> 1796 indexer = labels.slice_indexer(
1797 slice_obj.start, slice_obj.stop, slice_obj.step, kind=self.name
1798 )
~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py in slice_indexer(self, start, end, step, kind)
4711 slice(1, 3)
4712 """
-> 4713 start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
4714
4715 # return a slice
~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py in slice_locs(self, start, end, step, kind)
4930 end_slice = None
4931 if end is not None:
-> 4932 end_slice = self.get_slice_bound(end, "right", kind)
4933 if end_slice is None:
4934 end_slice = len(self)
~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
4846 except ValueError:
4847 # raise the original KeyError
-> 4848 raise err
4849
4850 if isinstance(slc, np.ndarray):
~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
4840 # we need to look up the label
4841 try:
-> 4842 slc = self.get_loc(label)
4843 except KeyError as err:
4844 try:
~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2646 return self._engine.get_loc(key)
2647 except KeyError:
-> 2648 return self._engine.get_loc(self._maybe_cast_indexer(key))
2649 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2650 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: '01/06/20'
I thought that the lines dealing with datetime would avoid this problem, but they don't.
How can I solve it?
Thank you
date.strftime("%m/%d/%y") returns 01/31/20, whereas the same column in your dataframe is labelled 1/31/20, thus the mismatch.
I would suggest that you try this:
def create_covid_pickle(csv_doc, date):
    """Read a COVID time-series CSV and return per-country case lists up to ``date``.

    Parameters:
        csv_doc: path (or anything ``pd.read_csv`` accepts) of a CSV with the
            columns ``Country/Region``, ``Lat``, ``Long`` and one column per
            day labelled ``month/day/2-digit-year`` (padded or not).
        date: last date to keep, as a string in ``"%m-%d-%y"`` format.

    Returns:
        A list with one dict per country of the form
        ``{country: {'time': [date labels], 'cases': [first row's values]}}``,
        where the date labels are zero-padded (``"%m/%d/%y"``).

    Raises:
        ValueError: if ``date`` does not match ``"%m-%d-%y"``.
        KeyError: if ``date`` is not one of the CSV's date columns.
    """
    csv_doc = pd.read_csv(csv_doc)

    def _normalize(col):
        # Re-format date-like headers (e.g. "1/6/20") to the zero-padded form
        # produced by strftime ("01/06/20"); leave every other header alone.
        try:
            return datetime.datetime.strptime(col, "%m/%d/%y").strftime("%m/%d/%y")
        except ValueError:
            return col

    # properly format csv_doc columns
    csv_doc.columns = [_normalize(col) for col in csv_doc.columns]

    # assign correct format to date
    date = datetime.datetime.strptime(date, "%m-%d-%y").strftime("%m/%d/%y")

    # delete columns I don't need
    # The columns are deliberately NOT sorted: a label slice on a unique
    # column index only needs both end labels to exist, and a lexicographic
    # sort would move 'Lat'/'Long' and scramble the chronological order of
    # the date columns.
    csv_doc = csv_doc.loc[:, 'Country/Region': date]
    csv_doc = csv_doc.drop(columns=['Lat', 'Long'])

    # Remaining steps carried over from the question's original function.
    csv_dictionary = [
        {c: {'time': d.columns.tolist(), 'cases': d.values.tolist()[0]}}
        for c, d in csv_doc.set_index(['Country/Region']).groupby('Country/Region')
    ]
    return csv_dictionary

Copying CSV files to XLSX with formats

Here is the snippet of my code.
import pandas as pd
import os
dpath = '//xxx//HEM'
for filename in os.listdir('//xxx//HEM'):
df = pd.read_csv(dpath + '/' + filename)
df = df['ab':'af'] #select required columns based on your requirement.
df["ab"] = pd.to_numeric(df["ab"]) # convert datatype of the column based on your need
df["af"] = pd.to_numeric(df["af"]) # convert datatype of the column based on your need
df1.append(df)
del df
df1.to_excel('test.xlsx')
In each CSV file in the folder I am reading from, columns AB and AF should contain numeric values. I get the following error.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-c4c944eb874e> in <module>
4 for filename in os.listdir('//xxx//HEM'):
5 df = pd.read_csv(dpath + '/' + filename)
----> 6 df = df['ab':'af'] #select required columns based on your requirement.
7 df["ab"] = pd.to_numeric(df["ab"]) # convert datatype of the column based on your need
8 df1.append(df)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2906
2907 # Do we have a slicer (on rows)?
-> 2908 indexer = convert_to_index_sliceable(self, key)
2909 if indexer is not None:
2910 return self._slice(indexer, axis=0)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in convert_to_index_sliceable(obj, key)
2454 idx = obj.index
2455 if isinstance(key, slice):
-> 2456 return idx._convert_slice_indexer(key, kind='getitem')
2457
2458 elif isinstance(key, compat.string_types):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _convert_slice_indexer(self, key, kind)
2926 """
2927 if self.is_integer() or is_index_slice:
-> 2928 return slice(self._validate_indexer('slice', key.start, kind),
2929 self._validate_indexer('slice', key.stop, kind),
2930 self._validate_indexer('slice', key.step, kind))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _validate_indexer(self, form, key, kind)
4708 pass
4709 elif kind in ['iloc', 'getitem']:
-> 4710 self._invalid_indexer(form, key)
4711 return key
4712
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _invalid_indexer(self, form, key)
3065 "indexers [{key}] of {kind}".format(
3066 form=form, klass=type(self), key=key,
-> 3067 kind=type(key)))
3068
3069 # --------------------------------------------------------------------
TypeError: cannot do slice indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [ab] of <class 'str'>
Is there something I am doing wrong? I am guessing it's the format of the data?

Problems while trying to extract dates from a pandas column?

I have a very similar pandas dataframe :
In:
import pandas as pd
df = pd.DataFrame({'blobs':['6-Feb- 1 4 Startup ZestFinance says it has built a machine-learning system that’s smart enough to find new borrowers and keep bias out of its credit analysis. 17-Feb-2014',
'Credit ratings have long been the key measure of how likely a U.S. 29—Oct-2012 consumer is to repay any loan, from mortgages to 18-0ct-12 credit cards. But the factors that FICO and other companies that create credit scores rely on—things like credit history and credit card balances—often depend on having credit already. ',
'November 22, 2012 In recent years, a crop of startup companies have launched on the premise 6—Feb- ] 4 that borrowers without such histories might still be quite likely to repay, and that their likelihood of doing so could be determined by analyzing large amounts of data, especially data that has traditionally not been part of the credit evaluation. These companies use algorithms and machine learning to find meaningful patterns in the data, alternative signs that a borrower is a good or bad credit risk.',
'March 1“, 2012 Los Angeles-based ZestFinance, founded by former Google CIO Douglas Merrill, claims to have solved this problem with a new credit-scoring platform, called ZAML. 06—Fcb—2012 The company sells the machine-learning software to lenders and also offers consulting 19—Jan— ] 2 services. Zest does not lend money itself. January 2, 1990']})
Each row has some noisy dates in different formats. Thus, my objective is to extract the dates into another column; for example, from the above data frame I would like to extract dates like this:
(*)
Out:
dates1 dates2
0 6-Feb-14, 17-Feb-2014 NaN
1 29—Oct-2012, 18-0ct-12 NaN
2 6—Feb- ]4 November 22, 2012
3 06—Fcb—2012, 19—Jan— ] 2 March 1“, 2012 | January 2, 1990
So far, I tried to define the following regex and extract from the column with extract():
In:
df['col1'] = df['blobs'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$")
df
However, I am getting the following error, and I would like to have a more robust method. Therefore, which is the best way of getting (*)?
Out:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2133 try:
-> 2134 return self._engine.get_loc(key)
2135 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'date_format_3'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
3667 try:
-> 3668 loc = self.items.get_loc(item)
3669 except KeyError:
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'date_format_3'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-3-ec929aa5341c> in <module>()
----> 1 df['date_format_3'] = df['text'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$", expand = True)
2 df
/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2417 else:
2418 # set column
-> 2419 self._set_item(key, value)
2420
2421 def _setitem_slice(self, key, value):
/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in _set_item(self, key, value)
2484 self._ensure_valid_index(value)
2485 value = self._sanitize_column(key, value)
-> 2486 NDFrame._set_item(self, key, value)
2487
2488 # check if we are modifying a copy
/usr/local/lib/python3.5/site-packages/pandas/core/generic.py in _set_item(self, key, value)
1498
1499 def _set_item(self, key, value):
-> 1500 self._data.set(key, value)
1501 self._clear_item_cache()
1502
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
3669 except KeyError:
3670 # This item wasn't present, just insert at end
-> 3671 self.insert(len(self.items), item, value)
3672 return
3673
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
3770
3771 block = make_block(values=value, ndim=self.ndim,
-> 3772 placement=slice(loc, loc + 1))
3773
3774 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2683 placement=placement, dtype=dtype)
2684
-> 2685 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2686
2687 # TODO: flexible with index=None and/or items=None
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, ndim, fastpath, placement, **kwargs)
1815
1816 super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath,
-> 1817 placement=placement, **kwargs)
1818
1819 @property
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath)
107 raise ValueError('Wrong number of items passed %d, placement '
108 'implies %d' % (len(self.values),
--> 109 len(self.mgr_locs)))
110
111 @property
ValueError: Wrong number of items passed 4, placement implies 1
You have to define a capture group in the str.extract() pattern with parentheses.
I extended/corrected the regex pattern, but it needs more work.
(You can review and experiment with this regular expression on regex101)
# Raw string so that \d, \s and \w reach the regex engine unchanged instead of
# being treated as (invalid) Python string escapes. The outer parentheses are
# the single capture group that str.extract() requires.
df['blobs'].str.extract(r'(\d{1,2}[\s-]\w+[\s-]\d{2,4}|\w+[\s-]\d{1,2}[",\s-]+\d{2,4})', expand = True)
Out:
0
0 17-Feb-2014
1 18-0ct-12
2 November 22, 2012
3 January 2, 1990

Categories

Resources