Let's say I have the following dataframe:
df = pd.DataFrame([[0.1,0],[0.2,1],[0.3,1],[0.4,0]], columns = ['score', 'correct_pred'])
score correct_pred
0 0.1 0
1 0.2 1
2 0.3 1
3 0.4 0
For each row, I would like to compute the proportion of rows with a score below it, and the proportion of correct predictions among the rows with a score equal to or above it.
For the second row, for instance, 25% of the rows have a score below 0.2, and 66% of the rows with a score equal to or above 0.2 have a correct prediction. The output would then look like:
threshold percentage_filtered percentage_correct_pred
0.1 0 0.5
0.2 0.25 0.66
0.3 0.5 0.5
0.4 0.75 0
So far I do it using this piece of code:
out = pd.DataFrame(columns=['threshold', 'percentage_filtered', 'percentage_correct_pred'])
for threshold in df.score:
    threshold_mask = df.score < threshold
    out.loc[len(out)] = [threshold,
                         np.mean(threshold_mask),
                         df[~threshold_mask].correct_pred.mean()]
This works, but it is terribly slow on a real-sized dataframe, so I need a faster version. I suspect there is a more vectorized method, maybe using numpy.cumsum or something similar?
I will assume that score may have repeated values; if it does not, this will also work (although it could be simpler). This is a way to get that result:
import pandas as pd
import numpy as np
df = pd.DataFrame([[0.1, 0], [0.2, 1], [0.3, 1], [0.4, 0]],
                  columns=['score', 'correct_pred'])
# Group by scores and count occurrences and number of correct predictions
df2 = (df.sort_values('score')
         .groupby('score')['correct_pred']
         .agg(['count', 'sum'])
         .reset_index())
# Percentage of values below each threshold
perc_filtered = df2['count'].shift(1).fillna(0).cumsum() / df2['count'].sum()
# Percentage of values above each threshold with correct prediction
perc_correct_pred = df2['sum'][::-1].cumsum()[::-1] / df2['count'][::-1].cumsum()[::-1]
# Assemble result
result = pd.concat([df2['score'], perc_filtered, perc_correct_pred], axis=1)
result.columns = ['threshold', 'percentage_filtered', 'percentage_correct_pred']
print(result)
# threshold percentage_filtered percentage_correct_pred
# 0 0.1 0.00 0.500000
# 1 0.2 0.25 0.666667
# 2 0.3 0.50 0.500000
# 3 0.4 0.75 0.000000
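Since the question specifically mentions numpy.cumsum, here is a minimal pure-NumPy sketch of the same idea (the function name do_it_numpy is just for illustration, and it assumes the scores are unique; with ties, use the groupby version above):

import numpy as np
import pandas as pd

def do_it_numpy(df):
    order = np.argsort(df['score'].to_numpy())
    score = df['score'].to_numpy()[order]
    correct = df['correct_pred'].to_numpy()[order]
    n = len(score)
    below = np.arange(n) / n                      # fraction of rows strictly below each score
    remaining = n - np.arange(n)                  # rows with score >= threshold
    correct_above = correct[::-1].cumsum()[::-1]  # correct predictions among those rows
    return pd.DataFrame({'threshold': score,
                         'percentage_filtered': below,
                         'percentage_correct_pred': correct_above / remaining})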
Performance:
np.random.seed(123)
df = pd.DataFrame({'score': np.arange(0, 1, 0.0005),
                   'correct_pred': np.random.choice([1, 0], size=2000)})
print(df)
score correct_pred
0 0.0000 1
1 0.0005 0
2 0.0010 1
3 0.0015 1
4 0.0020 1
... ...
1995 0.9975 0
1996 0.9980 0
1997 0.9985 1
1998 0.9990 1
1999 0.9995 1
[2000 rows x 2 columns]
In [208]: %timeit do_it_jdehesa()
9.57 ms ± 317 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [209]: %timeit do_it()
5.83 s ± 181 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [210]: %timeit do_it1()
3.21 s ± 203 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [211]: %timeit do_it2()
92.5 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Let's improve the runtime by a factor of 10.
For reference:
df = pd.DataFrame([[0.1,0],[0.2,1],[0.3,1],[0.4,0]], columns = ['score', 'correct_pred'])
def do_it():
    out = pd.DataFrame(columns=['threshold', 'percentage_filtered', 'percentage_correct_pred'])
    for threshold in df.score:
        threshold_mask = df.score < threshold
        out.loc[len(out)] = [threshold,
                             np.mean(threshold_mask),
                             df[~threshold_mask].correct_pred.mean()]
%timeit do_it()
13 ms ± 607 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
First, we take all calls to pandas methods out of the loop:
def do_it1():
    score_values = df.score.values
    score_list = list(set(score_values))
    correct_pred = df.correct_pred.values
    out = pd.DataFrame(columns=['threshold', 'percentage_filtered', 'percentage_correct_pred'])
    for threshold in score_list:
        mask = score_values < threshold
        out.loc[len(out)] = [threshold,
                             np.mean(mask),
                             np.mean(correct_pred[~mask])]
%timeit do_it1()
9.67 ms ± 331 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Then we also create the dataframe only after collecting the results:
def do_it2():
    score_values = df.score.values
    score_list = list(set(score_values))
    correct_pred = df.correct_pred.values
    result = []
    for threshold in score_list:
        mask = score_values < threshold
        result.append((threshold, np.mean(mask), np.mean(correct_pred[~mask])))
    out = pd.DataFrame(result, columns=['threshold', 'percentage_filtered', 'percentage_correct_pred'])
%timeit do_it2()
960 µs ± 16.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
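A further hedged sketch (not part of the original answer): the Python loop itself can be dropped by broadcasting a single comparison matrix of shape (n_thresholds, n_rows), trading memory for speed. The name do_it3 is hypothetical.

def do_it3():
    score_values = df['score'].to_numpy()
    correct_pred = df['correct_pred'].to_numpy()
    thresholds = np.unique(score_values)
    below = score_values < thresholds[:, None]   # shape (n_thresholds, n_rows)
    above = ~below
    return pd.DataFrame({'threshold': thresholds,
                         'percentage_filtered': below.mean(axis=1),
                         'percentage_correct_pred': (correct_pred * above).sum(axis=1) / above.sum(axis=1)})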
EDIT:
To take jdehesa's answer into account:
df = pd.DataFrame([[0.1, 0], [0.2, 1], [0.3, 1], [0.4, 0]],
                  columns=['score', 'correct_pred'])
def do_it_jdehesa():
    # Group by scores and count occurrences and number of correct predictions
    df2 = (df.sort_values('score')
             .groupby('score')['correct_pred']
             .agg(['count', 'sum'])
             .reset_index())
    # Percentage of values below each threshold
    perc_filtered = df2['count'].shift(1).fillna(0).cumsum() / df2['count'].sum()
    # Percentage of values above each threshold with correct prediction
    perc_correct_pred = df2['sum'][::-1].cumsum()[::-1] / df2['count'][::-1].cumsum()[::-1]
    # Assemble result
    result = pd.concat([df2['score'], perc_filtered, perc_correct_pred], axis=1)
    result.columns = ['threshold', 'percentage_filtered', 'percentage_correct_pred']
%timeit do_it_jdehesa()
13.5 ms ± 997 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
EDIT2: Just optimizing the function a little more, yet nowhere near as fast as jdehesa's answer:
def do_it5():
    dfarray = df.values
    n = len(dfarray)
    score_values = dfarray[:, 0]
    score_list = np.unique(score_values)
    correct_pred = dfarray[:, 1]
    result = []
    for threshold in score_list:
        mask = score_values < threshold
        n_above = n - np.count_nonzero(mask)
        result.append((threshold,
                       np.count_nonzero(mask) / n,
                       np.count_nonzero(correct_pred[~mask]) / n_above))
    result = pd.DataFrame(result, columns=['threshold', 'percentage_filtered', 'percentage_correct_pred'])
I have a dataframe as below:
0.1 0.65
0.2 0.664
0.3 0.606
0.4 0.587
0.5 0.602
0.6 0.59
0.7 0.53
I have to find the first occurrence below 0.6 in the second column and return the value of the first column on the same row. In this example the returned value would be 0.4.
How could I do this using NumPy or SciPy?
The code is:
import pandas as pd
df = pd.DataFrame([*zip([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], [0.65, 0.664, 0.606, 0.587, 0.602, 0.59, 0.53])])
threshold = 0.6
var = df[df[1] < threshold].head(1)[0]
res = var.iloc[0]
You can use masking and the df.head() function to get the first occurrence given the threshold.
df[df[1] < threshold].head(1)[0]
3 0.4
Name: 0, dtype: float64
Update
To use NumPy, you need to convert the pandas DataFrame to a NumPy array and use np.where.
array = df.values
array[np.where(array[:,1] < 0.6)][0,0]
0.4
To compare the performance, we will time the two sets of codes.
# Pandas style
def function1(df):
    return df[df[1] < threshold].head(1)[0]
# Numpy style
def function2(df):
    array = df.values
    return array[np.where(array[:, 1] < 0.6)][0, 0]
%timeit function1(df)
322 µs ± 6.71 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit function2(df)
11.8 µs ± 209 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
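One caveat (a hedged note, not from the original answer): if no value in the second column falls below the threshold, both versions fail — head(1) returns an empty frame, so .iloc[0] raises an IndexError, and the NumPy indexing raises an IndexError on the empty selection — so a small guard may be worthwhile:

# Hypothetical guard: return None when nothing is below the threshold.
idx = np.where(df.values[:, 1] < threshold)[0]
first_match = df.values[idx[0], 0] if idx.size else None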
I'm considering a multi-indexed (i, j, k) DataFrame with one column that contains a categorical variable taking the values A, B, or C.
I want to compute the frequency of each value for every i over all (j, k). I have a solution, but I think there is a more Pythonic and efficient way of doing it.
The code for an MWE reads (in reality len(I)*len(J)*len(K) is large, in the millions, say):
import pandas as pd
import numpy as np
I = range(10)
J = range(3)
K = range(2)
data = pd.DataFrame(
    np.random.randint(0, 3, size=len(I)*len(J)*len(K)),
    index=pd.MultiIndex.from_product([I, J, K]),
    columns=['cat']
)
data.index.names = ['i', 'j', 'k']
data[data['cat'] == 0] = 'A'
data[data['cat'] == 1] = 'B'
data[data['cat'] == 2] = 'C'
data = data.unstack(['j', 'k'])
result = data.apply(lambda x: x.value_counts(), axis=1).fillna(0) / (len(J)*len(K))
You could use groupby, and also normalize your value_counts:
data.groupby(level=0)['cat'].value_counts(normalize=True).unstack(level=1).fillna(0)
To compare, first let's make the dummy data big (60 million rows):
import pandas as pd
import numpy as np
I = range(100000)
J = range(30)
K = range(20)
data = pd.DataFrame(
    np.random.randint(0, 3, size=len(I)*len(J)*len(K)),
    index=pd.MultiIndex.from_product([I, J, K]),
    columns=['cat']
)
data.index.names = ['i', 'j', 'k']
data[data['cat'] == 0] = 'A'
data[data['cat'] == 1] = 'B'
data[data['cat'] == 2] = 'C'
Timing your original method:
data_interim = data.unstack(['j', 'k'])
data_interim.apply(lambda x: x.value_counts(), axis=1).fillna(0) / (len(J)*len(K))
gives (on my machine) 1min 24s ± 1.98 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
the alternative:
data.groupby(level=0)['cat'].value_counts(normalize=True).unstack(level=1).fillna(0)
gives (on my machine) 8.86 s ± 216 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Baseline for s_pike's method:
%%timeit
(data.groupby(level=0)['cat']
     .value_counts(normalize=True)
     .unstack(level=1)
     .fillna(0))
6.41 s ± 243 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
If the values are truly categorical, you can get a lot of benefit out of explicitly making the column categorical and then using either of the two methods below.
Both are still about twice as fast as the baseline without the categorical dtype, and become about 3x as fast when the column is made categorical.
data['cat'] = data['cat'].astype('category')
%%timeit
(data.groupby(level=0, as_index=False)['cat']
     .value_counts(normalize=True)
     .pivot(index='i', columns='cat', values='proportion'))
1.82 s ± 91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
(x := data.pivot_table(index='i', columns='cat', aggfunc='value_counts')).div(x.sum(1), 0)
1.8 s ± 107 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Outputs:
cat A B C
i
0 0.341667 0.318333 0.340000
1 0.311667 0.388333 0.300000
2 0.351667 0.350000 0.298333
3 0.363333 0.333333 0.303333
4 0.326667 0.350000 0.323333
... ... ... ...
99995 0.315000 0.313333 0.371667
99996 0.323333 0.351667 0.325000
99997 0.305000 0.353333 0.341667
99998 0.318333 0.341667 0.340000
99999 0.331667 0.340000 0.328333
[100000 rows x 3 columns]
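For completeness, a hedged alternative (not benchmarked here) is pd.crosstab, which can count and normalize per row in one call:

import pandas as pd

# Count A/B/C per value of the 'i' index level, then normalize each row to frequencies.
freq = pd.crosstab(index=data.index.get_level_values('i'),
                   columns=data['cat'],
                   normalize='index')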
I want to calculate daily bond returns from clean prices, based on the logarithm of the bond price at time t divided by the bond price at t-1. So far, I calculate it like this:
import pandas as pd
import numpy as np
#create example data
col1 = np.random.randint(0,10,size=10)
df = pd.DataFrame()
df["col1"] = col1
df["result"] = [0]*len(df)
# slow computation
for i in range(len(df)):
    if i == 0:
        df["result"][i] = np.nan
    else:
        df["result"][i] = np.log(df["col1"][i] / df["col1"][i-1])
However, since I have a large sample this takes a lot of time to compute. Is there a way to improve the code in order to make it faster?
Use Series.shift on the col1 column together with Series.div for the division:
df["result1"] = np.log(df["col1"].div(df["col1"].shift()))
#alternative
#df["result1"] = np.log(df["col1"] / df["col1"].shift())
print (df)
col1 result result1
0 5 NaN NaN
1 0 -inf -inf
2 3 inf inf
3 3 0.000000 0.000000
4 7 0.847298 0.847298
5 9 0.251314 0.251314
6 3 -1.098612 -1.098612
7 5 0.510826 0.510826
8 2 -0.916291 -0.916291
9 4 0.693147 0.693147
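Note that zero prices produce -inf/inf in the log return, as visible in rows 1 and 2 above. If NaN is preferred there, a hedged option (an assumption about the desired behaviour, not part of the original answer) is:

# Replace infinite log returns caused by zero prices with NaN.
df["result1"] = np.log(df["col1"] / df["col1"].shift()).replace([np.inf, -np.inf], np.nan)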
I tested both solutions:
np.random.seed(0)
col1 = np.random.randint(0,10,size=10000)
df = pd.DataFrame({'col1':col1})
In [128]: %timeit df["result1"] = np.log(df["col1"] / df["col1"].shift())
865 µs ± 139 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [129]: %timeit df.assign(result=lambda x: np.log(x.col1.pct_change() + 1))
1.16 ms ± 11.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [130]: %timeit df["result1"] = np.log(df["col1"].pct_change() + 1)
1.03 ms ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
np.random.seed(0)
col1 = np.random.randint(0,10,size=100000)
df = pd.DataFrame({'col1':col1})
In [132]: %timeit df["result1"] = np.log(df["col1"] / df["col1"].shift())
3.7 ms ± 189 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [133]: %timeit df.assign(result=lambda x: np.log(x.col1.pct_change() + 1))
6.31 ms ± 545 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [134]: %timeit df["result1"] = np.log(df["col1"].pct_change() + 1)
3.75 ms ± 269 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
No need to use multiple functions, use Series.pct_change():
df = df.assign(
    result=lambda x: np.log(x.col1.pct_change() + 1)
)
print(df)
col1 result
0 3 NaN
1 5 0.510826
2 8 0.470004
3 7 -0.133531
4 9 0.251314
5 1 -2.197225
6 1 0.000000
7 2 0.693147
8 7 1.252763
9 0 -inf
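Equivalently (a hedged variant, not part of the original answer), np.log1p can be applied to the percentage change, since log1p(x) = log(1 + x):

# Same result as np.log(pct_change + 1), written with log1p.
df = df.assign(result=lambda x: np.log1p(x.col1.pct_change()))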
This should be a much faster way to get the same results:
df["result_2"] = np.log(df["col1"] / df["col1"].shift())
Hi, I have the following DataFrame:
import pandas as pd
d = {'col1': [0.02, 0.12, -0.1, -0.07, 0.01]}
df = pd.DataFrame(data=d)
df['new'] = ''
df['new'].iloc[0] = 100
df
Starting in row 1, I tried to fill column 'new' with the previous value of 'new' divided by (the value of 'col1' + 1).
For example, in row 1, column 'new': 100 / (0.12 + 1) = 89.286
In row 2, column 'new': 89.286 / (-0.10 + 1) = 99.206
and so on.
I already tried to use a lambda function, without success. Thanks for your help.
Try this:
df['new'].iloc[0] = 100
for i in range(1, df.shape[0]):
    prev = df['new'].iloc[i-1]
    df['new'].iloc[i] = prev / (df['col1'].iloc[i] + 1)
Output:
   col1      new
0  0.02      100
1  0.12  89.2857
2 -0.10  99.2063
3 -0.07  106.673
4  0.01  105.617
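For this particular recurrence, a vectorized alternative is also possible (a hedged sketch, not from the original answers): since new[i] = new[i-1] / (1 + col1[i]), the whole column is the starting value divided by a running product of (1 + col1):

import pandas as pd

df = pd.DataFrame({'col1': [0.02, 0.12, -0.1, -0.07, 0.01]})
# Divide by the cumulative product of (1 + col1), rescaled so that row 0 stays at 100.
df['new'] = 100 * (1 + df['col1'].iloc[0]) / (1 + df['col1']).cumprod()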
I think numba is the way to go if you need a loop here and performance is important:
import pandas as pd
from numba import jit

d = {'col1': [0.02, 0.12, -0.1, -0.07, 0.01]}
df = pd.DataFrame(data=d)
df.loc[0, 'new'] = 100

@jit(nopython=True)
def f(a, b):
    for i in range(1, a.shape[0]):
        a[i] = a[i-1] / (b[i] + 1)
    return a

df['new'] = f(df['new'].to_numpy(), df['col1'].to_numpy())
print(df)
col1 new
0 0.02 100.000000
1 0.12 89.285714
2 -0.10 99.206349
3 -0.07 106.673494
4 0.01 105.617321
Performance for 5000 rows:
d = {'col1': [0.02,0.12,-0.1,0-0.07,0.01]}
df = pd.DataFrame(data=d)
df = pd.concat([df] * 1000, ignore_index=True)
In [168]: %timeit df['new'] = f(df['new'].to_numpy(), df['col1'].to_numpy())
277 µs ± 11.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [169]: %%timeit
...: for i in range(1,df.shape[0]):
...: prev = df['new'].iloc[i-1]
...: df['new'].iloc[i] = prev/(df['col1'].iloc[i]+1)
...:
1.31 s ± 20.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [170]: %%timeit
...: for i_row, row in df.iloc[1:, ].iterrows():
...: df.loc[i_row, 'new'] = df.loc[i_row - 1, 'new'] / (row['col1'] + 1)
...:
2.08 s ± 93.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
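A hedged usage note (not from the original answer): the first call to a numba-jitted function includes compilation time, so when benchmarking it is worth calling it once beforehand:

_ = f(df['new'].to_numpy(), df['col1'].to_numpy())  # run once so numba compiles before the timed runs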
I don't see a vectorized solution. Here's a plain loop:
df['new'] = 100
for i_row, row in df.iloc[1:, ].iterrows():
    df.loc[i_row, 'new'] = df.loc[i_row - 1, 'new'] / (row['col1'] + 1)
Suppose we have a table of customers and their spending.
import pandas as pd
df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Bob", "Charles"],
    "Spend": [3, 5, 7, 9]
})
LIMIT = 6
For each customer, we can compute the fraction of their spending that is larger than 6, using the apply method:
df.groupby("Name").apply(
    lambda grp: len(grp[grp["Spend"] > LIMIT]) / len(grp)
)
Name
Alice 0.0
Bob 0.5
Charles 1.0
However, the apply method is just a loop, which is slow if there are many customers.
Question: Is there a faster way, which presumably uses vectorization?
As of version 0.23.4, SeriesGroupBy does not support comparison operators:
(df.groupby("Name") ["Spend"] > LIMIT).mean()
TypeError: '>' not supported between instances of 'SeriesGroupBy' and 'int'
The code below results in a null value for Alice:
df[df["Spend"] > LIMIT].groupby("Name").size() / df.groupby("Name").size()
Name
Alice NaN
Bob 0.5
Charles 1.0
The code below gives the correct result, but it requires us to either modify the table, or make a copy to avoid modifying the original.
df["Dummy"] = 1 * (df["Spend"] > LIMIT)
df.groupby("Name") ["Dummy"] .sum() / df.groupby("Name").size()
Groupby does not use vectorization, but it has aggregate functions that are optimized with Cython.
You can take the mean:
(df["Spend"] > LIMIT).groupby(df["Name"]).mean()
df["Spend"].gt(LIMIT).groupby(df["Name"]).mean()
Or use div to replace NaN with 0:
df[df["Spend"] > LIMIT].groupby("Name").size() \
.div(df.groupby("Name").size(), fill_value = 0)
df["Spend"].gt(LIMIT).groupby(df["Name"]).sum() \
.div(df.groupby("Name").size(), fill_value = 0)
Each of the above will yield
Name
Alice 0.0
Bob 0.5
Charles 1.0
dtype: float64
Performance
Depends on the number of rows and number of rows filtered per condition, so it's best to test on real data.
np.random.seed(123)
N = 100000
df = pd.DataFrame({
    "Name": np.random.randint(1000, size=N),
    "Spend": np.random.randint(10, size=N)
})
LIMIT = 6
In [10]: %timeit df["Spend"].gt(LIMIT).groupby(df["Name"]).mean()
6.16 ms ± 332 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [11]: %timeit df[df["Spend"] > LIMIT].groupby("Name").size().div(df.groupby("Name").size(), fill_value = 0)
6.35 ms ± 95.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [12]: %timeit df["Spend"].gt(LIMIT).groupby(df["Name"]).sum().div(df.groupby("Name").size(), fill_value = 0)
9.66 ms ± 365 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# RafaelC comment solution
In [13]: %timeit df.groupby("Name")["Spend"].apply(lambda s: (s > LIMIT).sum() / s.size)
400 ms ± 27.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [14]: %timeit df.groupby("Name")["Spend"].apply(lambda s: (s > LIMIT).mean())
328 ms ± 6.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
This NumPy solution is vectorized, but a bit complicated:
In [15]: %%timeit
...: i, r = pd.factorize(df["Name"])
...: a = pd.Series(np.bincount(i), index = r)
...:
...: i1, r1 = pd.factorize(df["Name"].values[df["Spend"].values > LIMIT])
...: b = pd.Series(np.bincount(i1), index = r1)
...:
...: df1 = b.div(a, fill_value = 0)
...:
5.05 ms ± 82.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
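A hedged follow-up sketch (not part of the original answer): the second factorize can be avoided with a weighted bincount, which also gives 0 automatically for names that never exceed the limit:

# Factorize once, count rows per name, and count rows over the limit via boolean weights.
i, names = pd.factorize(df["Name"])
total = np.bincount(i)
over = np.bincount(i, weights=(df["Spend"].values > LIMIT))
result = pd.Series(over / total, index=names)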