pandas group by remove outliers - python

I want to remove outliers based on the 99th percentile values, group-wise.
import pandas as pd
df = pd.DataFrame({'Group': ['A','A','A','B','B','B','B'], 'count': [1.1,11.2,1.1,3.3,3.40,3.3,100.0]})
In the output I want to remove 11.2 from group A and 100 from group B, so the final dataset will only have 5 observations.
wantdf = pd.DataFrame({'Group': ['A','A','B','B','B'], 'count': [1.1,1.1,3.3,3.40,3.3]})
I have tried this, but I'm not getting the desired results:
df[df.groupby("Group")['count'].transform(lambda x : (x<x.quantile(0.99))&(x>(x.quantile(0.01)))).eq(1)]

Here is my solution:
def is_outlier(s):
    lower_limit = s.mean() - (s.std() * 3)
    upper_limit = s.mean() + (s.std() * 3)
    return ~s.between(lower_limit, upper_limit)

df = df[~df.groupby('Group')['count'].apply(is_outlier)]
You can write your own is_outlier function

I don't think you want to use quantile, as you'll exclude your lower values:
import pandas as pd
df = pd.DataFrame({'Group': ['A','A','A','B','B','B','B'], 'count': [1.1,11.2,1.1,3.3,3.40,3.3,100.0]})
print(pd.DataFrame(df.groupby('Group').quantile(.01)['count']))
output:
       count
Group
A        1.1
B        3.3
Those aren't outliers, right? So you wouldn't want to exclude them.
You could try setting left and right limits using standard deviations from the median instead. This is a bit verbose, but it gives you the right answer:
left = pd.DataFrame(df.groupby('Group').median() - pd.DataFrame(df.groupby('Group').std()))
right = pd.DataFrame(df.groupby('Group').median() + pd.DataFrame(df.groupby('Group').std()))
left.columns = ['left']
right.columns = ['right']
df = df.merge(left, left_on='Group', right_index=True)
df = df.merge(right, left_on='Group', right_index=True)
df = df[(df['count'] > df['left']) & (df['count'] < df['right'])]
df = df.drop(['left', 'right'], axis=1)
print(df)
output:
  Group  count
0     A    1.1
2     A    1.1
3     B    3.3
4     B    3.4
5     B    3.3
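A more compact version of the same median-plus-or-minus-one-standard-deviation rule (a sketch, not from the original answers), using groupby().transform so the boolean mask keeps the original row index:
import pandas as pd

df = pd.DataFrame({'Group': ['A', 'A', 'A', 'B', 'B', 'B', 'B'],
                   'count': [1.1, 11.2, 1.1, 3.3, 3.40, 3.3, 100.0]})

def is_outlier(s):
    # flag values more than one standard deviation from the group median
    return ~s.between(s.median() - s.std(), s.median() + s.std())

# transform keeps the mask aligned with df's index, so it can filter df directly
mask = df.groupby('Group')['count'].transform(is_outlier).astype(bool)
print(df[~mask])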

Related

Pandas custom exponential decay

Suppose I have the following dataframe.
df = pd.DataFrame({"a": [1, 0, 0, 2, 0]})
I want to construct a new dataframe based on df such that
newdf[0] = 1 or nan
newdf[1] = 0 + newdf[0] * exp(-alpha) # Alpha is some value.
newdf[2] = 0 + newdf[1] * exp(-alpha)
newdf[3] = 2 + newdf[2] * exp(-alpha)
newdf[4] = 0 + newdf[3] * exp(-alpha)
Basically I want to construct a new dataframe which accepts an instantaneous change and decays its own value.
Is there an elegant way to achieve this using pd.rolling or pd.ewm?
I'd like to avoid any for-loop because dataframe has many rows and columns.
Thanks
Use -
alpha = 2
df['new'] = 1 or np.nan
df['new'] = df['a'] + df['a'].shift(-1)*np.exp(-alpha)
import numpy as np is a dependency.
The last row in the df will be np.nan based on this.
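If you want the full recursive decay from the question without a Python loop, one option (not from the original thread, and assuming SciPy is available) is to treat the recurrence newdf[n] = a[n] + newdf[n-1] * exp(-alpha) as a first-order IIR filter:
import numpy as np
import pandas as pd
from scipy.signal import lfilter

alpha = 2
df = pd.DataFrame({"a": [1, 0, 0, 2, 0]})
decay = np.exp(-alpha)
# lfilter solves y[n] - decay*y[n-1] = a[n], i.e. y[n] = a[n] + decay*y[n-1]
df["new"] = lfilter([1.0], [1.0, -decay], df["a"].astype(float).to_numpy())
print(df)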

The best way to find the win percentage for a pandas DataFrame with win, loss columns

I have a pandas DataFrame with two columns ('win' and 'loss') and I want to find the win percentage ('win%') and pass it into the DataFrame. The thing is, for some rows, the entries are 0, so for those rows, I need to pass np.nan into 'win%'.
The following code does the job:
import numpy as np
import pandas as pd

df = pd.DataFrame([[1,2],[0,0],[2,1],[0,1]], columns=['win','loss'])
df['total'] = df['win'] + df['loss']
x = []
for i in range(df.shape[0]):
    if df['total'].iloc[i] > 0:
        x.append(df['win'].iloc[i] / df['total'].iloc[i])
    else:
        x.append(np.nan)
df['win%'] = x
Therefore, the desired outcome is:
   win  loss      win%
0    1     2  0.333333
1    0     0       NaN
2    2     1  0.666667
3    0     1  0.000000
I was wondering if there is a more efficient (pandas-y) way to do it. Also, I don't want to add an unnecessary column ('total') if I don't have to. Any help is appreciated.
You can set all the zero values to np.nan first (using replace), because:
np.nan / np.nan = np.nan
And:
np.nan + np.nan = np.nan
So:
df = pd.DataFrame(
    [[1,2],[0,0],[2,1]], columns=['win','loss']
).replace(0, np.nan)
df["win%"] = df["win"] / (df['win'] + df['loss'])

Creating a new feature column on grouped data in a Pandas dataframe

I have a Pandas dataframe with the columns ['week', 'price_per_unit', 'total_units']. I wish to create a new column called 'weighted_price' as follows: first group by 'week' and then for each week calculate price_per_unit * total_units / sum(total_units) for that week. I have code that does this:
import pandas as pd
import numpy as np
def create_features_by_group(df):
    # first group data
    grouped = df.groupby(['week'])
    df_temp = pd.DataFrame(columns=['weighted_price'])
    # run through the groups and create the weighted_price per group
    for name, group in grouped:
        res = (group['total_units'] * group['price_per_unit']) / np.sum(group['total_units'])
        for idx in res.index:
            df_temp.loc[idx] = [res[idx]]
    df = df.join(df_temp['weighted_price'])
    return df
The only problem is that this is very, very slow. Is there some faster way to do this?
I used the following code to test the function.
import pandas as pd
import numpy as np
df = pd.DataFrame(columns=['week', 'price_per_unit', 'total_units'])
for i in range(10):
    df.loc[i] = [round(int(i % 3), 0), 10 * np.random.rand(), round(10 * np.random.rand(), 0)]
I think you need to do it this way:
df
   price  total_units  week
0      5          100     1
1      7          200     1
2      9          150     2
3     11          250     2
4     13          125     2
def fun(table):
    table['measure'] = table['price'] * (table['total_units'] / table['total_units'].sum())
    return table

df.groupby('week').apply(fun)
   price  total_units  week   measure
0      5          100     1  1.666667
1      7          200     1  4.666667
2      9          150     2  2.571429
3     11          250     2  5.238095
4     13          125     2  3.095238
I have grouped the dataset by 'Week' to calculate the weighted price for each week.
Then I joined the original dataset with the grouped dataset to get the result:
# importing the libraries
import pandas as pd
import numpy as np
# creating the dataset
df = {
    'Week': [1, 1, 1, 1, 2, 2],
    'price_per_unit': [10, 11, 22, 12, 12, 45],
    'total_units': [10, 10, 10, 10, 10, 10]
}
df = pd.DataFrame(df)
df['price'] = df['price_per_unit'] * df['total_units']
# calculate the total sales and total number of units sold in each week
df_grouped_week = df.groupby(by = 'Week').agg({'price' : 'sum', 'total_units' : 'sum'}).reset_index()
# calculate the weighted price
df_grouped_week['wt_price'] = df_grouped_week['price'] / df_grouped_week['total_units']
# merging df and df_grouped_week
df_final = pd.merge(df, df_grouped_week[['Week', 'wt_price']], how = 'left', on = 'Week')
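For reference, the per-row weighted price from the question can also be computed in one vectorised step with groupby().transform (a sketch using the question's column names):
import pandas as pd

df = pd.DataFrame({
    'week': [1, 1, 2, 2, 2],
    'price_per_unit': [5, 7, 9, 11, 13],
    'total_units': [100, 200, 150, 250, 125],
})
# transform('sum') broadcasts each week's total back onto that week's rows,
# so the whole column is computed without looping over groups
week_totals = df.groupby('week')['total_units'].transform('sum')
df['weighted_price'] = df['price_per_unit'] * df['total_units'] / week_totals
print(df)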

add a different random number to every cell in a pandas dataframe

I need to add some 'noise' to my data, so I would like to add a different random number to every cell in my pandas dataframe. This code works, but seems unpythonic. Is there a better way?
import pandas as pd
import numpy as np

df = pd.DataFrame(0.0, index=[1, 2, 3, 4, 5], columns=list('ABC'))
print(df)
for x, line in df.iterrows():
    for col in df:
        line[col] = line[col] + (np.random.rand() - 0.5) / 1000.0
print(df)
df + np.random.rand(*df.shape) / 10000.0
OR
Let's use applymap:
df = pd.DataFrame(1.0, index=[1,2,3,4,5], columns=list('ABC') )
df.applymap(lambda x: x + np.random.rand()/10000.0)
output: a 5x3 DataFrame (columns A, B, C; index 1-5) in which each cell is 1.0 plus a small random offset.
This would be a more succinct and equivalent method:
In [147]:
df = pd.DataFrame((np.random.rand(5,3) - 0.5)/1000.0, columns=list('ABC'))
df
Out[147]:
          A         B         C
0  0.000381 -0.000167  0.000020
1  0.000482  0.000007 -0.000281
2 -0.000032 -0.000402 -0.000251
3 -0.000037 -0.000319  0.000260
4 -0.000035  0.000178  0.000166
If you're doing this to an existing df with non-zero values then add:
In [149]:
df = pd.DataFrame(np.random.randn(5,3), columns=list('ABC'))
df
Out[149]:
          A         B         C
0 -1.705644  0.149067  0.835378
1 -0.956335 -0.586120  0.212981
2  0.550727 -0.401768  1.421064
3  0.348885  0.879210  0.136858
4  0.271063  0.132579  1.233789
In [154]:
df.add((np.random.rand(df.shape[0], df.shape[1]) - 0.5)/1000.0)
Out[154]:
          A         B         C
0 -1.705459  0.148671  0.835761
1 -0.956745 -0.586382  0.213339
2  0.550368 -0.401651  1.421515
3  0.348938  0.878923  0.136914
4  0.270864  0.132864  1.233622
For nonzero data:
df + (np.random.rand(*df.shape) - 0.5) * 0.001
OR
df + np.random.uniform(-0.01, 0.01, df.shape)
For cases where your data frame contains zeros that you wish to keep as zero:
df * (1 + (np.random.rand(*df.shape) - 0.5) * 0.001)
OR
df * (1 + np.random.uniform(-0.01, 0.01, df.shape))
I think either of these should work; it's a case of generating an array of the same shape as your existing df and adding it to your existing df (or multiplying by 1 + noise where you want zeros to remain zero). With the uniform function you can set the scale of the noise by changing the 0.01 value.
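A further sketch with NumPy's newer Generator API (assuming NumPy >= 1.17), which draws one value per cell in a single call and makes the noise reproducible via a seed:
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(0.0, index=[1, 2, 3, 4, 5], columns=list('ABC'))
# one uniform draw per cell, in the same +/-0.0005 range used above
df_noisy = df + rng.uniform(-0.0005, 0.0005, size=df.shape)
print(df_noisy)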

How to plot stacked & normalized histograms?

I have a dataset that maps continuous values to discrete categories. I want to display a histogram with the continuous values as x and categories as y, where bars are stacked and normalized. Example:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
df = pd.DataFrame({
    'score': np.random.rand(1000),
    'category': np.random.choice(list('ABCD'), 1000)
}, columns=['score', 'category'])
print(df.head(10))
Output:
      score category
0  0.649371        B
1  0.042309        B
2  0.689487        A
3  0.433064        B
4  0.978859        A
5  0.789140        C
6  0.215758        D
7  0.922389        B
8  0.105364        D
9  0.010274        C
If I try to plot this as a histogram using df.hist(by='category'), I get 4 separate graphs, one per category.
I managed to get the graph I wanted but I had to do a lot of manipulation.
# One column per category, 1 if maps to category, 0 otherwise
df2 = pd.DataFrame({
    'score': df.score,
    'A': (df.category == 'A').astype(float),
    'B': (df.category == 'B').astype(float),
    'C': (df.category == 'C').astype(float),
    'D': (df.category == 'D').astype(float)
}, columns=['score', 'A', 'B', 'C', 'D'])
# select "bins" of .1 width, and sum for each category
df3 = pd.DataFrame([df2[(df2.score >= (n/10.0)) & (df2.score < ((n+1)/10.0))].iloc[:, 1:].sum() for n in range(10)])
# Sum over series for weights
df4 = df3.sum(1)
bars = pd.DataFrame(df3.values / np.tile(df4.values, [4, 1]).transpose(), columns=list('ABCD'))
bars.plot.bar(stacked=True)
I expect there is a more straightforward way to do this, easier to read and understand and more optimized with less intermediate steps. Any solutions?
I don't know if this is really that much more compact or readable than what you already have, but it is a suggestion (a late one as such :)).
import numpy as np
import pandas as pd
df = pd.DataFrame({
    'score': np.random.rand(1000),
    'category': np.random.choice(list('ABCD'), 1000)
}, columns=['score', 'category'])
# Set the range of the score as a category using pd.cut
df.set_index(pd.cut(df['score'], np.linspace(0, 1, 11)), inplace=True)
# Count all entries for all scores and all categories
a = df.groupby([df.index, 'category']).size()
# Normalize
b = df.groupby(df.index)['category'].count()
df_a = a.div(b, axis=0,level=0)
# Plot
df_a.unstack().plot.bar(stacked=True)
Consider assigning bins with cut, calculating grouping percentages with couple of groupby().transform calls, and then aggregate and reshape with pivot_table:
# CREATE BIN INDICATORS
df['plot_bins'] = pd.cut(df['score'], bins=np.arange(0, 1.1, 0.1),
                         labels=np.arange(0, 1, 0.1)).round(1)
# CALCULATE PCT OF CATEGORY OUT OF BINs
df['pct'] = (df.groupby(['plot_bins', 'category'])['score'].transform('count')
             .div(df.groupby(['plot_bins'])['score'].transform('count')))
# PIVOT TO AGGREGATE + RESHAPE
agg_df = (df.pivot_table(index='plot_bins', columns='category', values='pct', aggfunc='max')
          .reset_index(drop=True))
# PLOT
agg_df.plot(kind='bar', stacked=True, rot=0)
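Another compact option (a sketch, not from the original answers) is pd.crosstab with normalize='index', which counts categories per score bin and normalizes each bin so the stacked bars sum to 1:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'score': np.random.rand(1000),
    'category': np.random.choice(list('ABCD'), 1000)
})
# rows = score bins, columns = categories; normalize='index' makes each row sum to 1
bins = pd.cut(df['score'], np.linspace(0, 1, 11))
pd.crosstab(bins, df['category'], normalize='index').plot.bar(stacked=True)
plt.show()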
