Filtering by condition in Python dataset - python

I'm struggling with a filtering operation on a Stata file in Python 3: I was asked to keep only the households without kids in a dataset/table:
I used a filtering condition to filter these rows out of the table:
filtering_condition = df["kids"] > 0
df_nokids = df.loc[filtering_condition,"kids"]
This, however, gives me an unknown error:
KeyError Traceback (most recent call last)
/opt/anaconda/anaconda3/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
1944 try:
-> 1945 return self._engine.get_loc(key)
1946 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()
KeyError: 'kids'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-321-e72cd8a67065> in <module>()
1 #keep only the households without kids and use this dataset for the rest of the assignment
----> 2 filtering_condition = df["kids"] > 0
3 df_nokids = df.loc[filtering_condition,"kids"]
/opt/anaconda/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
1995 return self._getitem_multilevel(key)
1996 else:
-> 1997 return self._getitem_column(key)
1998
1999 def _getitem_column(self, key):
/opt/anaconda/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2002 # get column
2003 if self.columns.is_unique:
-> 2004 return self._get_item_cache(key)
2005
2006 # duplicate columns & possible reduce dimensionality
/opt/anaconda/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1348 res = cache.get(item)
1349 if res is None:
-> 1350 values = self._data.get(item)
1351 res = self._box_item_values(item, values)
1352 cache[item] = res
/opt/anaconda/anaconda3/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3288
3289 if not isnull(item):
-> 3290 loc = self.items.get_loc(item)
3291 else:
3292 indexer = np.arange(len(self.items)) [isnull(self.items)]
/opt/anaconda/anaconda3/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
1945 return self._engine.get_loc(key)
1946 except KeyError:
-> 1947 return self._engine.get_loc(self._maybe_cast_indexer(key))
1948
1949 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()
KeyError: 'kids'
Any explanations of what I am doing wrong?
Thanks!
Datafile:

Do you mean something like this:
df_kids = df[df['kids']>0]
This selects the rows where the 'kids' column is greater than zero. To keep only the households without kids, use df[df['kids'] == 0] instead.

Related

Selecting rows with a string index that contains a bracket

My table review_cp is indexed on beer names. I got the top three beer names through the following code.
top_3_spacy = review_cp.groupby('Name')['Average Evaluation Score'].mean().sort_values(by='Average Evaluation Score', ascending = False).index[:3].tolist()
The results are ['Rodenbach Caractère Rouge', 'Dorothy (Wine Barrel Aged)', 'Doubleganger']
However, when I tried to select rows using review_cp.loc[top_3_spacy[0]], it gave me a key error.
KeyError Traceback (most recent call
last) ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in
get_loc(self, key, method, tolerance) 2896 try:
-> 2897 return self._engine.get_loc(key) 2898 except KeyError:
pandas_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas_libs\index_class_helper.pxi in
pandas._libs.index.Int64Engine._check_type()
KeyError: 'Rodenbach Caractère Rouge'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call
last) in
----> 1 review_cp.loc[top_3_spacy[0]]
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
getitem(self, key) 1422 1423 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1424 return self._getitem_axis(maybe_callable, axis=axis) 1425 1426 def _is_scalar_access(self, key:
Tuple):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_getitem_axis(self, key, axis) 1848 # fall thru to straight lookup 1849 self._validate_key(key, axis)
-> 1850 return self._get_label(key, axis=axis) 1851 1852
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_get_label(self, label, axis)
158 raise IndexingError("no slices here, handle elsewhere")
159
--> 160 return self.obj._xs(label, axis=axis)
161
162 def _get_loc(self, key: int, axis: int):
~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key,
axis, level, drop_level) 3735 loc, new_index =
self.index.get_loc_level(key, drop_level=drop_level) 3736
else:
-> 3737 loc = self.index.get_loc(key) 3738 3739 if isinstance(loc, np.ndarray):
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in
get_loc(self, key, method, tolerance) 2897 return
self._engine.get_loc(key) 2898 except KeyError:
-> 2899 return self._engine.get_loc(self._maybe_cast_indexer(key)) 2900
indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2901 if indexer.ndim > 1 or indexer.size > 1:
pandas_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas_libs\index_class_helper.pxi in
pandas._libs.index.Int64Engine._check_type()
KeyError: 'Rodenbach Caractère Rouge'
I tried another method using review_cp[review_cp['Name'].str.contains(top_3_spacy[0])], it worked for 'Rodenbach Caractère Rouge' and 'Doubleganger', but not for 'Dorothy (Wine Barrel Aged)'. I wonder if it was because of the bracket?
I doubt the issue is due to the bracket, as it is part of the string. As long as the string matches a name in the "Name" column, there shouldn't be a problem. If you want to get the rows of your top 3 list, instead of using loc, you can use:
review_cp[review_cp['Name'].isin(top_3_spacy)]
That will isolate your top3 names (and it should include Dorothy).
I guess the problem is that Name is not in your index; otherwise, your groupby statement would not be able to access the value. So your index is most likely an automatic integer index.
But .loc expects values in the index and can't find the name in the index of your DataFrame.
You can resolve this by using an indexer like in the post of DDD1.
review_cp['Name'].isin(top_3_spacy)
This creates the boolean indexer that selects the rows whose names are in the list.

reshaping the list to write select statement

I have a dataframe in this format:
I want to select the rows that have a specific class, based on the output of previous operations. The problem is that the previous output is formatted like
most=["['books']"], so when I tried to write the select statement I wrote it like this:
df.loc[str([df['class'][0]])==most[0]]
However, I got this error:
~\Anaconda3\mianaconda\lib\site-packages\pandas\core\indexing.py in _validate_key(self, key, axis)
1789 if not ax.contains(key):
-> 1790 error()
1791 except TypeError as e:
~\Anaconda3\mianaconda\lib\site-packages\pandas\core\indexing.py in error()
1784 .format(key=key,
-> 1785 axis=self.obj._get_axis_name(axis)))
1786
KeyError: 'the label [True] is not in the [index]'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-166-604a94f3536a> in <module>
----> 1 df.loc[str([df['class'][0]])==most[0]]
~\Anaconda3\mianaconda\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1476
1477 maybe_callable = com._apply_if_callable(key, self.obj)
-> 1478 return self._getitem_axis(maybe_callable, axis=axis)
1479
1480 def _is_scalar_access(self, key):
~\Anaconda3\mianaconda\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1909
1910 # fall thru to straight lookup
-> 1911 self._validate_key(key, axis)
1912 return self._get_label(key, axis=axis)
1913
~\Anaconda3\mianaconda\lib\site-packages\pandas\core\indexing.py in _validate_key(self, key, axis)
1796 raise
1797 except:
-> 1798 error()
1799
1800 def _is_scalar_access(self, key):
~\Anaconda3\mianaconda\lib\site-packages\pandas\core\indexing.py in error()
1783 raise KeyError(u"the label [{key}] is not in the [{axis}]"
1784 .format(key=key,
-> 1785 axis=self.obj._get_axis_name(axis)))
1786
1787 try:
KeyError: 'the label [True] is not in the [index]'
How can I solve this?
Try this:
df.loc[df['class'] == eval(most[0])[0]]
The string in most[0] must first be converted back into a list; its first element, which is the actual class name, is then compared against the 'class' column.

Can't read a column after pivoting pandas dataframe

I have a table that came from a pivot table, which I built to eliminate missing values and values that are too short, such as city names. Here's my code:
company = pd.read_sql('SELECT user_id, address FROM company' , con=db_connection)
table = pd.pivot_table(company, index=['address'],aggfunc=np.sum)
table.reset_index()
Then I get this
address user_id
3 Jl. Raya Kranggan No. 7, Ruko Kav V No. 1 Jat... 65132
4 #ALAMAT atau LOKASI\r\nKota bengkulu perhubung... 15570
5 '--!>'</script/><Svg/Onload=confirm`alamat bis... 48721
6 (Rumah Bpk.RA'IS) Jl.Puskesmas RT.004/11 No.29... 20786
It seems OK when I check the columns
table.columns
Index(['user_id', 'address'], dtype='object')
Then I can't call a column
table['address']
When I call that column, this is what happens
KeyError Traceback (most recent call last)
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2392 try:
-> 2393 return self._engine.get_loc(key)
2394 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5239)()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5085)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20405)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20359)()
KeyError: 'address'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-46-eef3b78ea5fd> in <module>()
----> 1 table['address'] #.astype(str)
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2060 return self._getitem_multilevel(key)
2061 else:
-> 2062 return self._getitem_column(key)
2063
2064 def _getitem_column(self, key):
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2067 # get column
2068 if self.columns.is_unique:
-> 2069 return self._get_item_cache(key)
2070
2071 # duplicate columns & possible reduce dimensionality
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1532 res = cache.get(item)
1533 if res is None:
-> 1534 values = self._data.get(item)
1535 res = self._box_item_values(item, values)
1536 cache[item] = res
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2393 return self._engine.get_loc(key)
2394 except KeyError:
-> 2395 return self._engine.get_loc(self._maybe_cast_indexer(key))
2396
2397 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5239)()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5085)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20405)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20359)()
KeyError: 'address'
If you have another solution, that's fine, as long as I can access the address to do keyword mapping.
I think you need to assign the output of reset_index back, because address is the index name, not a column:
table = pd.pivot_table(company, index='address',aggfunc=np.sum).reset_index()
Another solution, if you want to specify the columns to aggregate with sum:
table = company.groupby('address', as_index=False)['user_id'].sum()
Or:
table = company.groupby('address')['user_id'].sum().reset_index()
And for all columns:
table = company.groupby('address', as_index=False).sum()
table = company.groupby('address').sum().reset_index()
I don't think pivot is a proper choice here.
You can use this:
company.groupby('address').sum()

Problems while trying to extract dates from a pandas column?

I have a very similar pandas dataframe :
In:
import pandas as pd
df = pd.DataFrame({'blobs':['6-Feb- 1 4 Startup ZestFinance says it has built a machine-learning system that’s smart enough to find new borrowers and keep bias out of its credit analysis. 17-Feb-2014',
'Credit ratings have long been the key measure of how likely a U.S. 29—Oct-2012 consumer is to repay any loan, from mortgages to 18-0ct-12 credit cards. But the factors that FICO and other companies that create credit scores rely on—things like credit history and credit card balances—often depend on having credit already. ',
'November 22, 2012 In recent years, a crop of startup companies have launched on the premise 6—Feb- ] 4 that borrowers without such histories might still be quite likely to repay, and that their likelihood of doing so could be determined by analyzing large amounts of data, especially data that has traditionally not been part of the credit evaluation. These companies use algorithms and machine learning to find meaningful patterns in the data, alternative signs that a borrower is a good or bad credit risk.',
'March 1“, 2012 Los Angeles-based ZestFinance, founded by former Google CIO Douglas Merrill, claims to have solved this problem with a new credit-scoring platform, called ZAML. 06—Fcb—2012 The company sells the machine-learning software to lenders and also offers consulting 19—Jan— ] 2 services. Zest does not lend money itself. January 2, 1990']})
Each row has some noisy dates in different formats. Thus, my objective is to extract the dates into another column. For example, from the above data frame I would like to extract dates like this:
(*)
Out:
dates1 dates2
0 6-Feb-14, 17-Feb-2014 NaN
1 29—Oct-2012, 18-0ct-12 NaN
2 6—Feb- ]4 November 22, 2012
3 06—Fcb—2012, 19—Jan— ] 2 March 1“, 2012 | January 2, 1990
So far, I tried to define the following regex and extract from the column with extract():
In:
df['col1'] = df['blobs'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$")
df
However, I am getting the following error, and I would like to have a more robust method. Therefore, which is the best way of getting (*)?
Out:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2133 try:
-> 2134 return self._engine.get_loc(key)
2135 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'date_format_3'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
3667 try:
-> 3668 loc = self.items.get_loc(item)
3669 except KeyError:
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'date_format_3'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-3-ec929aa5341c> in <module>()
----> 1 df['date_format_3'] = df['text'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$", expand = True)
2 df
/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2417 else:
2418 # set column
-> 2419 self._set_item(key, value)
2420
2421 def _setitem_slice(self, key, value):
/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in _set_item(self, key, value)
2484 self._ensure_valid_index(value)
2485 value = self._sanitize_column(key, value)
-> 2486 NDFrame._set_item(self, key, value)
2487
2488 # check if we are modifying a copy
/usr/local/lib/python3.5/site-packages/pandas/core/generic.py in _set_item(self, key, value)
1498
1499 def _set_item(self, key, value):
-> 1500 self._data.set(key, value)
1501 self._clear_item_cache()
1502
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
3669 except KeyError:
3670 # This item wasn't present, just insert at end
-> 3671 self.insert(len(self.items), item, value)
3672 return
3673
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
3770
3771 block = make_block(values=value, ndim=self.ndim,
-> 3772 placement=slice(loc, loc + 1))
3773
3774 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2683 placement=placement, dtype=dtype)
2684
-> 2685 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2686
2687 # TODO: flexible with index=None and/or items=None
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, ndim, fastpath, placement, **kwargs)
1815
1816 super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath,
-> 1817 placement=placement, **kwargs)
1818
1819 #property
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath)
107 raise ValueError('Wrong number of items passed %d, placement '
108 'implies %d' % (len(self.values),
--> 109 len(self.mgr_locs)))
110
111 #property
ValueError: Wrong number of items passed 4, placement implies 1
You have to define a capture group in the str.extract() call with parentheses.
I extended/corrected the regex pattern, but it needs more work.
(You can review and experiment with this regular expression on regex101)
df['blobs'].str.extract("(\d{1,2}[\s-]\w+[\s-]\d{2,4}|\w+[\s-]\d{1,2}[\",\s-]+\d{2,4})", expand = True)
Out:
0
0 17-Feb-2014
1 18-0ct-12
2 November 22, 2012
3 January 2, 1990

Get keyerror accessing row by index in 16.0 Pandas dataframe in Python 3.4

Why do I keep getting a key error?
[edit] Here is the data:
GEO,LAT,LON
AALBORG DENMARK,57.0482206,9.9193939
AARHUS DENMARK,56.1496278,10.2134046
ABBOTSFORD BC CANADA,49.0519047,-122.3290473
ABEOKUTA NIGERIA,7.161,3.348
ABERDEEN SCOTLAND,57.1452452,-2.0913745
[end edit]
Can't find row by index, but it's clearly there:
geocache = pd.read_csv('geolog.csv',index_col=['GEO']) # index_col=['GEO']
geocache.head()
Shows
LAT LON
GEO
AALBORG DENMARK 57.048221 9.919394
AARHUS DENMARK 56.149628 10.213405
ABBOTSFORD BC CANADA 49.051905 -122.329047
ABEOKUTA NIGERIA 7.161000 3.348000
ABERDEEN SCOTLAND 57.145245 -2.091374
So then I test it:
x = 'AARHUS DENMARK'
print(x)
geocache[x]
And this is what I get:
AARHUS DENMARK
KeyError Traceback (most recent call last)
in ()
2 x = u'AARHUS DENMARK'
3 print(x)
----> 4 geocache[x]
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1785 return self._getitem_multilevel(key)
1786 else:
-> 1787 return self._getitem_column(key)
1788
1789 def _getitem_column(self, key):
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1792 # get column
1793 if self.columns.is_unique:
-> 1794 return self._get_item_cache(key)
1795
1796 # duplicate columns & possible reduce dimensionaility
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1077 res = cache.get(item)
1078 if res is None:
-> 1079 values = self._data.get(item)
1080 res = self._box_item_values(item, values)
1081 cache[item] = res
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
2841
2842 if not isnull(item):
-> 2843 loc = self.items.get_loc(item)
2844 else:
2845 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Users\g\Anaconda3\lib\site-packages\pandas\core\index.py in get_loc(self, key, method)
1435 """
1436 if method is None:
-> 1437 return self._engine.get_loc(_values_from_object(key))
1438
1439 indexer = self.get_indexer([key], method=method)
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3824)()
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3704)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12349)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12300)()
KeyError: 'AARHUS DENMARK'
No extra spaces or non-visible chars, Tried putting r and u before the string assignment with no change in behavior.
Ok, what am I missing?
As you didn't pass a sep (separator) arg to read_csv the default is comma separated. As your csv contained spaces/tabs after the commas then these get treated as part of the data hence your index data contains embedded spaces.
So you need to pass additional params to read_csv:
pd.read_csv('geolog.csv',index_col=['GEO'], sep=',\s+', engine='python')
The sep arg means that it will look for a comma followed by one or more whitespace characters; we pass engine='python' because the C engine does not accept a regex for separators.

Categories

Resources