Can't read a column after pivoting pandas dataframe - python

I have a table that came from a pivot table, which I built to eliminate missing values and values that are too short (such as a bare city name). Here's my code:
company = pd.read_sql('SELECT user_id, address FROM company' , con=db_connection)
table = pd.pivot_table(company, index=['address'],aggfunc=np.sum)
table.reset_index()
Then I get this:
address user_id
3 Jl. Raya Kranggan No. 7, Ruko Kav V No. 1 Jat... 65132
4 #ALAMAT atau LOKASI\r\nKota bengkulu perhubung... 15570
5 '--!>'</script/><Svg/Onload=confirm`alamat bis... 48721
6 (Rumah Bpk.RA'IS) Jl.Puskesmas RT.004/11 No.29... 20786
It seems OK when I check the columns:
table.columns
Index(['user_id', 'address'], dtype='object')
But then I can't select a column:
table['address']
When I select that column, this is what happens:
KeyError Traceback (most recent call last)
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2392 try:
-> 2393 return self._engine.get_loc(key)
2394 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5239)()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5085)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20405)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20359)()
KeyError: 'address'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-46-eef3b78ea5fd> in <module>()
----> 1 table['address'] #.astype(str)
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2060 return self._getitem_multilevel(key)
2061 else:
-> 2062 return self._getitem_column(key)
2063
2064 def _getitem_column(self, key):
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2067 # get column
2068 if self.columns.is_unique:
-> 2069 return self._get_item_cache(key)
2070
2071 # duplicate columns & possible reduce dimensionality
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1532 res = cache.get(item)
1533 if res is None:
-> 1534 values = self._data.get(item)
1535 res = self._box_item_values(item, values)
1536 cache[item] = res
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Users\asus\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2393 return self._engine.get_loc(key)
2394 except KeyError:
-> 2395 return self._engine.get_loc(self._maybe_cast_indexer(key))
2396
2397 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5239)()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5085)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20405)()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20359)()
KeyError: 'address'
If you have another solution, that's fine, as long as I can access the address column to do keyword mapping.

I think you need to assign the output of reset_index back, because address is the index name, not a column:
table = pd.pivot_table(company, index='address',aggfunc=np.sum).reset_index()
Other solutions, if you want to specify the columns for the aggregate sum:
table = company.groupby('address', as_index=False)['user_id'].sum()
Or:
table = company.groupby('address')['user_id'].sum().reset_index()
And for all columns:
table = company.groupby('address', as_index=False).sum()
table = company.groupby('address').sum().reset_index()

I don't think pivot is a proper choice here.
You can use this:
company.groupby('address').sum()

Related

Why can't I create two columns with different moving average windows in pandas?

I'm honestly out of ideas with this one. I've been trying to create a couple of columns from a dataframe, but I get the ValueError: Wrong number of items passed 2, placement implies 1. Somehow I can create one column, no matter whether it is the window=7 or the window=14 one, but I am only able to create one. Here's my code:
import pandas as pd
from datetime import datetime, timedelta
suspects_url = 'https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/covid19_suspects_mx.csv'
suspects = pd.read_csv(suspects_url, index_col=0)
suspects = suspects.loc['Colima']
suspects = pd.DataFrame(suspects)
suspects.index = pd.to_datetime(suspects.index, format='%d-%m-%Y')
suspects['suspects_ma_7'] = suspects.rolling(window=7).mean()
suspects['suspects_ma_14'] = suspects.rolling(window=14).mean()
suspects.columns = ['suspects','suspects_ma_7','suspects_ma_14']
suspects
And this is the error I am getting:
KeyError Traceback (most recent call last)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2645 try:
-> 2646 return self._engine.get_loc(key)
2647 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'suspects_ma_14'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/internals/managers.py in set(self, item, value)
1070 try:
-> 1071 loc = self.items.get_loc(item)
1072 except KeyError:
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2647 except KeyError:
-> 2648 return self._engine.get_loc(self._maybe_cast_indexer(key))
2649 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'suspects_ma_14'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-6-4138f1be9342> in <module>
5 suspects.index = pd.to_datetime(suspects.index, format='%d-%m-%Y')
6 suspects['suspects_ma_7'] = suspects.rolling(window=7).mean()
----> 7 suspects['suspects_ma_14'] = suspects.rolling(window=14).mean()
8 suspects.columns = ['suspects','suspects_ma_7','suspects_ma_14']
9
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2936 else:
2937 # set column
-> 2938 self._set_item(key, value)
2939
2940 def _setitem_slice(self, key, value):
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/frame.py in _set_item(self, key, value)
2999 self._ensure_valid_index(value)
3000 value = self._sanitize_column(key, value)
-> 3001 NDFrame._set_item(self, key, value)
3002
3003 # check if we are modifying a copy
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/generic.py in _set_item(self, key, value)
3622
3623 def _set_item(self, key, value) -> None:
-> 3624 self._data.set(key, value)
3625 self._clear_item_cache()
3626
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/internals/managers.py in set(self, item, value)
1072 except KeyError:
1073 # This item wasn't present, just insert at end
-> 1074 self.insert(len(self.items), item, value)
1075 return
1076
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/internals/managers.py in insert(self, loc, item, value, allow_duplicates)
1179 new_axis = self.items.insert(loc, item)
1180
-> 1181 block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))
1182
1183 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/internals/blocks.py in make_block(values, placement, klass, ndim, dtype)
3045 values = DatetimeArray._simple_new(values, dtype=dtype)
3046
-> 3047 return klass(values, ndim=ndim, placement=placement)
3048
3049
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/internals/blocks.py in __init__(self, values, placement, ndim)
122
123 if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
--> 124 raise ValueError(
125 f"Wrong number of items passed {len(self.values)}, "
126 f"placement implies {len(self.mgr_locs)}"
ValueError: Wrong number of items passed 2, placement implies 1
How can I solve this issue?
To reiterate: when I create only one suspects_ma_# column, it works. But when I try to create both, I get the error.
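After you first run suspects['suspects_ma_7'] = suspects.rolling(window=7).mean(), your DataFrame has two columns, so the next call to suspects.rolling(window=14).mean() returns a two-column DataFrame, which cannot be assigned to a single new column (hence "Wrong number of items passed 2, placement implies 1").
So, for each rolling calculation, select the original column explicitly, for example: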
suspects['suspects_ma_7'] = suspects.Colima.rolling(window=7).mean()
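Note the "suspects.Colima" in the code above: it selects the single original column as a Series, and the same applies when computing the window=14 column.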

Selecting rows with a string index that contains a bracket

My table review_cp is indexed on beer names. I got the top three beer names through the following code.
top_3_spacy = review_cp.groupby('Name')['Average Evaluation Score'].mean().sort_values(by='Average Evaluation Score', ascending = False).index[:3].tolist()
The results are ['Rodenbach Caractère Rouge', 'Dorothy (Wine Barrel Aged)', 'Doubleganger']
However, when I tried to select rows using review_cp.loc[top_3_spacy[0]], it gave me a key error.
KeyError Traceback (most recent call
last) ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in
get_loc(self, key, method, tolerance) 2896 try:
-> 2897 return self._engine.get_loc(key) 2898 except KeyError:
pandas_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas_libs\index_class_helper.pxi in
pandas._libs.index.Int64Engine._check_type()
KeyError: 'Rodenbach Caractère Rouge'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call
last) in
----> 1 review_cp.loc[top_3_spacy[0]]
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
getitem(self, key) 1422 1423 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1424 return self._getitem_axis(maybe_callable, axis=axis) 1425 1426 def _is_scalar_access(self, key:
Tuple):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_getitem_axis(self, key, axis) 1848 # fall thru to straight lookup 1849 self._validate_key(key, axis)
-> 1850 return self._get_label(key, axis=axis) 1851 1852
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_get_label(self, label, axis)
158 raise IndexingError("no slices here, handle elsewhere")
159
--> 160 return self.obj._xs(label, axis=axis)
161
162 def _get_loc(self, key: int, axis: int):
~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key,
axis, level, drop_level) 3735 loc, new_index =
self.index.get_loc_level(key, drop_level=drop_level) 3736
else:
-> 3737 loc = self.index.get_loc(key) 3738 3739 if isinstance(loc, np.ndarray):
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in
get_loc(self, key, method, tolerance) 2897 return
self._engine.get_loc(key) 2898 except KeyError:
-> 2899 return self._engine.get_loc(self._maybe_cast_indexer(key)) 2900
indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2901 if indexer.ndim > 1 or indexer.size > 1:
pandas_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas_libs\index_class_helper.pxi in
pandas._libs.index.Int64Engine._check_type()
KeyError: 'Rodenbach Caractère Rouge'
I tried another method, review_cp[review_cp['Name'].str.contains(top_3_spacy[0])]; it worked for 'Rodenbach Caractère Rouge' and 'Doubleganger', but not for 'Dorothy (Wine Barrel Aged)'. I wonder if that was because of the brackets?
I doubt the issue is due to the bracket, as it is part of the string. As long as the string matches a name in the "Name" column, there shouldn't be a problem. If you want to get the rows of your top 3 list, instead of using loc, you can use:
review_cp[review_cp['Name'].isin(top_3_spacy)]
That will isolate your top3 names (and it should include Dorothy).
I guess the problem is that Name is not in your index; otherwise, your groupby statement would not be able to access it as a column. So your index is most likely an automatic integer index.
But .loc looks up labels in the index, and it can't find the name in the index of your DataFrame.
You can resolve this by using a boolean indexer, as in DDD1's post:
review_cp['Name'].isin(top_3_spacy)
This creates a boolean indexer that selects the rows whose names are in the list.

Making a column of boolean values based on two conditions in pandas dataframe

I'm trying to make a column of boolean values based on whether one column contains the word 'hazard' and does not contain the word 'roof' (so that I get all non-roof hazards).
I'm using the below code and I'm getting an error:
labels['h_count2'] = labels[(labels['Description'].str.contains('Hazard')) & (labels['Description'].str.contains('Roof'))]
This is the traceback:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2656 try:
-> 2657 return self._engine.get_loc(key)
2658 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'h_count2'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\internals\managers.py in set(self, item, value)
1052 try:
-> 1053 loc = self.items.get_loc(item)
1054 except KeyError:
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2658 except KeyError:
-> 2659 return self._engine.get_loc(self._maybe_cast_indexer(key))
2660 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'h_count2'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-46-51360ea6f27f> in <module>
1 labels['h_count'] = labels['Description'].str.contains('Roof Hazard')
2 labels['b_count'] = labels['Description'].str.contains('Brush')
----> 3 labels['h_count2'] = labels[(labels['Description'].str.contains('Hazard')) & (labels['Description'].str.contains('Roof'))]
4
5 def target(row):
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3368 else:
3369 # set column
-> 3370 self._set_item(key, value)
3371
3372 def _setitem_slice(self, key, value):
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3444 self._ensure_valid_index(value)
3445 value = self._sanitize_column(key, value)
-> 3446 NDFrame._set_item(self, key, value)
3447
3448 # check if we are modifying a copy
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\generic.py in _set_item(self, key, value)
3170
3171 def _set_item(self, key, value):
-> 3172 self._data.set(key, value)
3173 self._clear_item_cache()
3174
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\internals\managers.py in set(self, item, value)
1054 except KeyError:
1055 # This item wasn't present, just insert at end
-> 1056 self.insert(len(self.items), item, value)
1057 return
1058
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\internals\managers.py in insert(self, loc, item, value, allow_duplicates)
1156
1157 block = make_block(values=value, ndim=self.ndim,
-> 1158 placement=slice(loc, loc + 1))
1159
1160 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\internals\blocks.py in make_block(values, placement, klass, ndim, dtype, fastpath)
3093 values = DatetimeArray._simple_new(values, dtype=dtype)
3094
-> 3095 return klass(values, ndim=ndim, placement=placement)
3096
3097
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
2629
2630 super(ObjectBlock, self).__init__(values, ndim=ndim,
-> 2631 placement=placement)
2632
2633 #property
C:\ProgramData\Anaconda3\envs\tensorflowenvironment\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
85 raise ValueError(
86 'Wrong number of items passed {val}, placement implies '
---> 87 '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))
88
89 def _check_ndim(self, values, ndim):
ValueError: Wrong number of items passed 5, placement implies 1
What am I doing wrong?
labels:
A Description
0 1 Roof
1 2 Hazard
2 3 Roof Hazard
labels['h_count2'] = labels.Description.str.contains('Hazard') & ~labels.Description.str.contains('Roof')
Results in
A Description h_count2
0 1 Roof False
1 2 Hazard True
2 3 Roof Hazard False
labels = pd.DataFrame({'Description': ['Hazard Roof test', 'test', 'Hazard is not', 'test2']})
labels['h_count2'] = (labels['Description'].str.upper().str.contains('HAZARD')) & ~(labels['Description'].str.upper().str.contains('ROOF'))
Description h_count2
0 Hazard Roof test False
1 test False
2 Hazard is not True
3 test2 False

Dummy Variable Column Head not found?

I am attempting logistic regression for a class, and I got stuck on something very strange; I haven't seen this error before. I am a complete Python noob, so I'd appreciate very specific, detailed answers. Please dumb it down for me!
Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
cancer=pd.read_csv('C:\\Users\\kpic1\\Desktop\\Spring 2019\\IDIS 450\\Second exam\\Provided documents\\Cancer_Research_Q2.csv')
cancer.set_index('PatientID', inplace=True)
cancer=pd.get_dummies(cancer, columns=['Class'], drop_first=True)
cancer.head()
from sklearn.model_selection import train_test_split
cancer.rename(columns={'Class_Malignant':'Malignant'}, inplace=True)
cancer.head()
The table that is returned shows the dummy variable on the very end still being named "Class_Malignant."
When I try to print just this column using
print(cancer['Class_Malignant'])
I get the following long error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3077 try:
-> 3078 return self._engine.get_loc(key)
3079 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Class_Malignant'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-15-09d1356e98ac> in <module>
1 #for some reason, "Class_Malignant" is not registering as a column head?
----> 2 print(cancer['Class_Malignant'])
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
2487 res = cache.get(item)
2488 if res is None:
-> 2489 values = self._data.get(item)
2490 res = self._box_item_values(item, values)
2491 cache[item] = res
~\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3078 return self._engine.get_loc(key)
3079 except KeyError:
-> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key))
3081
3082 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Class_Malignant'
From what I can tell from this, the column name "Class_Malignant" does not exist in the data set? Anyone have any ideas what's going on?

Why does applying NLTK stop words to a data frame raise an error?

Reviews Label
0 Bromwell High is a cartoon comedy. It ran at t... Positive
1 Homelessness (or Houselessness as George Carli... Positive
2 Brilliant over-acting by Lesley Ann Warren. Be... Positive
Above is my data frame, with the columns Reviews and Label. When I executed the code below: `
nltk.download('stopwords') This is used to update stop words.
from nltk.corpus import stopwords
stop = stopwords.words('english')
final_without_stopwords = final[['Reviews','Label']].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])).str.replace('[^\w\s]','')
print(final_without_stopwords)`
Result:
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3077 try:
-> 3078 return self._engine.get_loc(key)
3079 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: ('Reviews', 'Label')
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-52-cb4ca290db84> in <module>()
5 #final['Reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
6
----> 7 final_without_stopwords = final['Reviews','Label'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])).str.replace('[^\w\s]','')
8 print(final_without_stopwords)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
2487 res = cache.get(item)
2488 if res is None:
-> 2489 values = self._data.get(item)
2490 res = self._box_item_values(item, values)
2491 cache[item] = res
~\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3078 return self._engine.get_loc(key)
3079 except KeyError:
-> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key))
3081
3082 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: ('Reviews', 'Label')
enter code here
**
What I actually want is to remove stop words from my data frame, which has only two columns. When I executed this code with a single column (Reviews), it worked well,
but when I executed it with two columns (Reviews & Label), it raised
an error. Any suggestions on how to make this code work with both columns?
**
If you want to apply a function elementwise to the dataframe, use applymap:
A simplified example:
import pandas as pd
stop = set(['a','the','i','is'])
df = pd.DataFrame( {'sentence1':['i am a boy','i am a girl'],
'sentence2':['Bromwell High is a cartoon comedy','i am a girl']})
df[['sentence1','sentence2']].applymap(lambda x: ' '.join(i for i in x.split() if i not in stop))
sentence1 sentence2
0 am boy Bromwell High cartoon comedy
1 am girl am girl
If you want to reassign the values without stopwords into your dataframe, use:
df[['sentence1','sentence2']] = df[['sentence1','sentence2']].applymap(lambda x: ' '.join(i for i in x.split() if i not in stop))

Categories

Resources