I have a DataFrame (df) with many columns and rows.
What I'd like to do is access the values in one column for which the values in two other columns match my indexer.
This is what my code looks like now:
df.loc[df.delays == curr_d, df.prev_delay == prev_d, 'd_stim']
In case it isn't clear, my goal is to select the values in the column 'd_stim' for which other values in the same row are curr_d (in the 'delays' column) and prev_d (in the 'prev_delay' column).
This use of loc does not work. It raises the following error:
/home/despo/dbliss/dopa_net/behavioral_experiments/analysis_code/behavior_analysis.py in plot_prev_curr_interaction(data_frames, labels)
2061 for k, prev_d in enumerate(delays):
2062 diff = np.array(df.loc[df.delays == curr_d,
-> 2063 df.prev_delay == prev_d, 'd_stim'])
2064 ind = ~np.isnan(diff)
2065 diff_rad = np.deg2rad(diff[ind])
/usr/local/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1292
1293 if type(key) is tuple:
-> 1294 return self._getitem_tuple(key)
1295 else:
1296 return self._getitem_axis(key, axis=0)
/usr/local/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
787
788 # no multi-index, so validate all of the indexers
--> 789 self._has_valid_tuple(tup)
790
791 # ugly hack for GH #836
/usr/local/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
139 for i, k in enumerate(key):
140 if i >= self.obj.ndim:
--> 141 raise IndexingError('Too many indexers')
142 if not self._has_valid_type(k, i):
143 raise ValueError("Location based indexing can only have [%s] "
IndexingError: Too many indexers
What is the appropriate way to access the data I need?
Your logic isn't working for two reasons.
pandas doesn't know what to do with comma separated conditions
df.delays == curr_d, df.prev_delay == prev_d
Assuming you meant *and*, you need to wrap each condition in parentheses and join them with &. This is @MaxU's solution from the comments and should work unless you haven't given us everything.
df.loc[(df.delays == curr_d) & (df.prev_delay == prev_d), 'd_stim']
However, I think this looks prettier.
df.query('delays == @curr_d and prev_delay == @prev_d').d_stim
If this works then so should've @MaxU's. If neither works, I suggest you post some sample data, because most folk don't like guessing what your data is.
Related
I am new to python and I can't figure out why I get this error: ValueError: Incompatible indexer with Series.
I am trying to add a date to my data frame.
The date I am trying to add:
date = (chec[(chec['Día_Sem']=='Thursday') & (chec['ID']==2011957)]['Entrada'])
date
Date output:
56 1900-01-01 07:34:00
Name: Entrada, dtype: datetime64[ns]
Then I try to add 'date' to my data frame using loc:
rep.loc[2039838,'Thursday'] = date
rep
And I get this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-347-3e0678b0fdbf> in <module>
----> 1 rep.loc[2039838,'Thursday'] = date
2 rep
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
188 key = com.apply_if_callable(key, self.obj)
189 indexer = self._get_setitem_indexer(key)
--> 190 self._setitem_with_indexer(indexer, value)
191
192 def _validate_key(self, key, axis):
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value)
640 # setting for extensionarrays that store dicts. Need to decide
641 # if it's worth supporting that.
--> 642 value = self._align_series(indexer, Series(value))
643
644 elif isinstance(value, ABCDataFrame):
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
781 return ser.reindex(ax)._values
782
--> 783 raise ValueError('Incompatible indexer with Series')
784
785 def _align_frame(self, indexer, df):
ValueError: Incompatible indexer with Series
I was also facing a similar issue, but in a different scenario. I came across threads about duplicate indices, but of course that was not the case for me. What worked for me was to use .at in place of .loc. So you can try it and see if it works:
rep['Thursday'].at[2039838] = date.values[0]
Try date.iloc[0] instead of date:
rep.loc[2039838,'Thursday'] = date.iloc[0]
Because date is actually a Series (so basically like a list/array) of the values, and .iloc[0] actually selects the value.
You use loc to set a specific value, but your date is a Series (or DataFrame), so the types on the two sides of the assignment cannot match. You can instead assign the underlying value of date to rep.loc[2039838,'Thursday']. For example, if date is a non-empty Series, you can do this:
rep.loc[2039838,'Thursday'] = date.values[0]
I have the following problem
I have a DF with a season variable that I have used one-hot encoding on,
so I now have 4 Boolean columns with 1's and 0's, which were used to build a model from some known good data. I now need to use this model to find the correct season in some bad data,
so I built a simple test case to try to hard-code summer.
def season_model(row1):
row1 = row1.iloc[:]
row1.loc[:,'Summer'] =1
row1.loc[:,'Winter'] =0
row1.loc[:,'Spring'] =0
row1.loc[:,'Autumn'] =0
predictions = model.predict(row1)
cur_pred= predictions[0][0]
return cur_pred
this worked when I manually subset a row like shown below
row1 = prediction_data[3:4]
row1 =row1.iloc[:,:-1]
However when I try to do so using the apply() function on a data frame like below:
oos_df['s_predictions'] = oos_df[["Summer", "Winter", "Spring","Autumn"]].apply(lambda x: season_model(x),axis=1)
I run in to the following error, I have been trying to resolve this for a while but keep coming up blank
<ipython-input-254-241c900a588c> in season_model(row1)
5 # for season in season_encode:
6 #encode = season_encode[season]
----> 7 row1.loc[:,'Summer'] =1
8 row1.loc[:,'Winter'] =0
9 row1.loc[:,'Spring'] =0
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
667 else:
668 key = com.apply_if_callable(key, self.obj)
--> 669 indexer = self._get_setitem_indexer(key)
670 self._setitem_with_indexer(indexer, value)
671
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_setitem_indexer(self, key)
660 if "cannot do" in str(e):
661 raise
--> 662 raise IndexingError(key)
663
664 def __setitem__(self, key, value):
IndexingError: (slice(None, None, None), 'Summer')
When you do:
oos_df[["Summer", "Winter", "Spring","Autumn"]].apply(lambda x: season_model(x),axis=1)
You are sending the row values of the oos_df columns to your function season_model, where x represents a single row; apply with axis=1 calls the function once per row (in your case with the Summer, Winter, Spring and Autumn values).
Take a look at your function - the row1 argument you receive (which holds the "Summer", "Winter", "Spring" and "Autumn" values) is only a single row of data, so there's no need to do row1 = row1.iloc[:].
When you do this:
row1 = prediction_data[3:4]
row1 = row1.iloc[:,:-1]
This works because row1 first holds a one-row DataFrame, and then row1.iloc[:,:-1] holds that row with the last column dropped. When you send it to a function with apply and lambda, you're saying: hey function - here's a row from my df, I want you to do something with it.
Put it differently, use this function below and see the result and then you can rebuild 'sesson_model' correctly.
def season_model(row1):
print (row1)
oos_df[["Summer", "Winter", "Spring","Autumn"]].apply(lambda x: season_model(x),axis=1)
I have this function:
def same_price(df=df):
df= df.sort_values(by='Ticket')
nucleus= dict()
k=0
while df.shape[0]>=2:
if df.Price.iloc[0]==df.Price.iloc[1]:
value= df.Price.iloc[0]
n=0
nucleus[k]= []
while df.Price.iloc[n]==value:
nucleus[k].append(df.index[n])
n+=1
if n>df.shape[0]:
df.drop(nucleus[k], axis=0, inplace=True)
break
else:
df.drop(nucleus[k], axis=0, inplace=True)
k+=1
else:
if df.shape[0]>=3:
df.drop(df.index[0], axis=0, inplace=True)
else:
break
return(nucleus)
The objective of the function is to go through the ordered dataframe, and list together the persons who paid the same price GIVEN the sequence of the 'Ticket'id. (I do not just want to list together ALL the people who paid the same price, no matter the sequence!)
The dataframe:
Price Ticket
Id
521 93.5000 12749
821 93.5000 12749
584 40.1250 13049
648 35.5000 13213
633 30.5000 13214
276 77.9583 13502
628 77.9583 13502
766 77.9583 13502
435 55.9000 13507
578 55.9000 13507
457 26.5500 13509
588 79.2000 13567
540 49.5000 13568
48 7.7500 14311
574 7.7500 14312
369 7.7500 14313
When I test it:
same_price(df[:11]) is working just fine and the output is: {0: [521, 821], 1: [276, 628, 766], 2: [435, 578]}
but same_price(df[:10]) throws: IndexError: single positional indexer is out-of-bounds.
I'd like to know what is wrong with this function guys.
Thx
I found what's wrong, if anyone is interested...
df.iloc[n] gets the (n+1)th line of the dataframe, but shape[0] == n means the dataframe has only n elements (valid positions 0 through n-1).
Hence we must use if n+1 > df.shape[0]: instead of if n > df.shape[0]:
Cheers :)
I checked similar questions posted about slicing DFs in Python but they didn't explain the inconsistency I'm seeing in my exercise.
The code works with the known diamonds data frame. Top lines of the data frame are:
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
I have to create a slicing function which takes 4 arguments: DataFrame 'df', a column of that DataFrame
'col', the label of another column 'label' and two values 'val1' and 'val2'. The function will take the frame and output the entries of the column indicated by the 'label' argument for which the rows of the column 'col' are greater than the number 'val1' and less than 'val2'.
The following stand-alone piece of code gives me the correct answer:
diamonds.loc[(diamonds.carat > 1.1) & (diamonds.carat < 1.4),['price']]
and I get the price from the rows where the carat value is between 1.1 and 1.4.
However, when I try to use this syntax in a function, it doesn't work and I get an error.
Function:
def slice2(df,col,output_label,val1,val2):
res = df.loc[(col > val1) & (col < val2), ['output_label']]
return res
Function call:
slice2(diamonds,diamonds.carat,'price',1.1,1.4)
Error:
"None of [['output_label']] are in the [columns]"
Full traceback message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-64-adc582faf6cc> in <module>()
----> 1 exercise2(test_df,test_df.carat,'price',1.1,1.4)
<ipython-input-63-556b71ba172d> in exercise2(df, col, output_label, val1, val2)
1 def exercise2(df,col,output_label,val1,val2):
----> 2 res = df.loc[(col > val1) & (col < val2), ['output_label']]
3 return res
/Users/jojo/Library/Enthought/Canopy/edm/envs/User/lib/python3.5/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1323 except (KeyError, IndexError):
1324 pass
-> 1325 return self._getitem_tuple(key)
1326 else:
1327 key = com._apply_if_callable(key, self.obj)
/Users/jojo/Library/Enthought/Canopy/edm/envs/User/lib/python3.5/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
839
840 # no multi-index, so validate all of the indexers
--> 841 self._has_valid_tuple(tup)
842
843 # ugly hack for GH #836
/Users/jojo/Library/Enthought/Canopy/edm/envs/User/lib/python3.5/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
187 if i >= self.obj.ndim:
188 raise IndexingError('Too many indexers')
--> 189 if not self._has_valid_type(k, i):
190 raise ValueError("Location based indexing can only have [%s] "
191 "types" % self._valid_types)
/Users/jojo/Library/Enthought/Canopy/edm/envs/User/lib/python3.5/site-packages/pandas/core/indexing.py in _has_valid_type(self, key, axis)
1416
1417 raise KeyError("None of [%s] are in the [%s]" %
-> 1418 (key, self.obj._get_axis_name(axis)))
1419
1420 return True
KeyError: "None of [['output_label']] are in the [columns]"
I'm not very advanced in Python and after looking at this code for a while I haven't been able to figure out what the problem is. Maybe I'm blind to something obvious here and would appreciate any pointed on how to get the function to work or how to redo it so that it gives the same result as the single line code.
Thanks
In your function
def slice2(df,col,output_label,val1,val2):
res = df.loc[(col > val1) & (col < val2), ['output_label']]
return res
you are searching for a column literally named 'output_label' instead of using the value of your parameter (the quotes make it a string literal rather than a reference to your variable!)
This should work:
def slice2(df,col,output_label,val1,val2):
res = df.loc[(col > val1) & (col < val2), [output_label]] # notice that there are not quotes
return res
I'm working with twitter data related to two different keywords.I want to count the number of tweets per day for each keyword, visualise the results on a line graph, then overlay that with the dates of a range of significant calendar events.
My aim is to see whether tweet counts change around specific events. I've already counted and visualised the tweets, but am having problems figuring out how to overlay key dates.
I tried putting the important dates into a list, but it threw an error. Could anyone give me some pointers or suggest a better way to approach this?
Here's an image that gives a rough idea of what I'm trying to achieve:
https://imgur.com/a/36esk1B
dates_list = ['2016-06-16','2016-06-23', '2016-06-24',
'2016-07-02', '2016-07-13']
#then convert list into a Series
key_dates = pd.Series(pd.to_datetime(dates_list))
# add columns to identify important events, and mark a 0 or 1.
tweet_trend['Important Events'] = False
tweet_trend.loc[key_dates, 'Important Events'] = True
tweet_trend['values'] = 0
tweet_trend.loc[key_dates, 'values'] = 1
KeyError Traceback (most recent call last)
<ipython-input-88-04dd081adc28> in <module>
10 # add columns to identify important events, and mark a 0 or 1.
11 tweet_trend['Important Events'] = False
---> 12 tweet_trend.loc[key_dates, 'Important Events'] = True
13 tweet_trend['values'] = 0
14 tweet_trend.loc[key_dates, 'values'] = 1
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
187 else:
188 key = com.apply_if_callable(key, self.obj)
--> 189 indexer = self._get_setitem_indexer(key)
190 self._setitem_with_indexer(indexer, value)
191
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _get_setitem_indexer(self, key)
165 if isinstance(key, tuple):
166 try:
--> 167 return self._convert_tuple(key, is_setter=True)
168 except IndexingError:
169 pass
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_tuple(self, key, is_setter)
246 if i >= self.obj.ndim:
247 raise IndexingError('Too many indexers')
--> 248 idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter)
249 keyidx.append(idx)
250 return tuple(keyidx)
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
1352 kwargs = {'raise_missing': True if is_setter else
1353 raise_missing}
-> 1354 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
1355 else:
1356 try:
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1159 self._validate_read_indexer(keyarr, indexer,
1160 o._get_axis_number(axis),
-> 1161 raise_missing=raise_missing)
1162 return keyarr, indexer
1163
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1250 if not(self.name == 'loc' and not raise_missing):
1251 not_found = list(set(key) - set(ax))
-> 1252 raise KeyError("{} not in index".format(not_found))
1253
1254 # we skip the warning on Categorical/Interval
KeyError: "[Timestamp('2016-06-16 00:00:00')] not in index"
You can use Index.isin to test membership and then cast the column to integer to map True/False to 1/0; also, converting to a Series is not necessary:
dates_list = ['2016-06-16','2016-06-23', '2016-06-24',
'2016-07-02', '2016-07-13']
key_dates = pd.to_datetime(dates_list)
tweet_trend['Important Events'] = tweet_trend.index.isin(key_dates)
tweet_trend['values'] = tweet_trend['Important Events'].astype(int)