I am trying to calculate the Bollinger Bands of Facebook stock, but I found that rm_FB (the calculated rolling mean) is all NaN:
def get_rolling_mean(values, window):
    """Return rolling mean of given values, using specified window size."""
    t = pd.date_range('2016-02-01', '2016-06-06', freq='D')
    # print("Hey")
    # print(values);
    D = pd.Series(values, t)
    return D.rolling(window=20, center=False).mean()


def test_run():
    # Read data
    dates = pd.date_range('2016-02-01', '2016-06-06')
    symbols = ['FB']
    df = get_data(symbols, dates)

    # Compute Bollinger Bands
    # 1. Compute rolling mean
    rm_FB = get_rolling_mean(df['FB'], window=20)
    print("Hey")
    print(rm_FB)


if __name__ == "__main__":
    test_run()
I was a bit confused by how you asked, so I manufactured some data and created a function that I hope helps.
import pandas as pd
import numpy as np


def bollinger_bands(s, k=2, n=20):
    """get_bollinger_bands DataFrame

    s is series of values
    k is multiple of standard deviations
    n is rolling window
    """
    b = pd.concat([s, s.rolling(n).agg([np.mean, np.std])], axis=1)
    b['upper'] = b['mean'] + b['std'] * k
    b['lower'] = b['mean'] - b['std'] * k

    return b.drop('std', axis=1)
Demonstration
np.random.seed([3,1415])
s = pd.Series(np.random.randn(100) / 100, name='price').add(1.001).cumprod()
bollinger_bands(s).plot()
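If the FB prices from get_data already carry a DatetimeIndex, they can be fed to this function directly. A minimal usage sketch, assuming df is the frame from your test_run above:
bands = bollinger_bands(df['FB'])
print(bands.tail())  # FB price, rolling mean, upper and lower bands
bands.plot()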
I wrote this function to compute the normalized percentage correlation between two filter functions (with one shifted). The function works but takes about 8 to 12 minutes depending on the number of elements in nbs. I would like to know if there is another way to make this operation faster. Here is my code below:
import numpy as np

DT = 0.08

def corr_g(*nbs, Np=10000, sf=0.5):
    wb = 0.25  # bandwidth in Hz
    freq = (1/DT)*np.linspace(-0.5, 0.5-1/Np, Np)  # frequency vector
    dCg_norms = np.zeros((Np, len(nbs)))

    for idx, nb in enumerate(nbs):  # nb is the filter parameter
        d_k_vector = np.linspace(-Np*sf, Np*sf, Np)  # indices vector
        dCg = d_k_vector*0  # array to hold correlation
        g = ((1+np.exp(-nb))**2)/((1+np.exp(-nb*(freq+wb)/wb))*(1+np.exp(nb*(freq-wb)/wb)))  # filter function

        for index2, d_k in enumerate(d_k_vector):  # loop through the new indices vector
            for index, sth in enumerate(g):
                # form a new array from g using the indices vector use only values
                # within the limits of g. Then do a dot product operation
                if (index+d_k) < Np and (index+d_k) >= 0:
                    dCg[index2] += g[index] * g[index+int(d_k)]

        dCg_norm = dCg/np.max(dCg)*100  # normalized correlation
        dCg_norms[:, idx] = dCg_norm  # add to allocated array

    return dCg_norms

my_arr = corr_g(*[2, 4, 8, 16])

import matplotlib.pyplot as plt

Np = 10000
DT = 0.08
d_k_vector = np.linspace(-5000, 5000, Np)
plt.plot(d_k_vector/(10000*DT)/0.25, my_arr[:, 1])
You should not calculate the correlation yourself; better to use np.correlate(g, g, 'same'). There are small differences between your result and mine, and I am pretty sure the error is on your side.
def corr_g2(*nbs, Np=10000, sf=0.5):
    wb = 0.25  # bandwidth in Hz
    freq = (1/DT)*np.linspace(-0.5, 0.5-1/Np, Np)  # frequency vector
    dCg_norms = np.zeros((Np, len(nbs)))

    for idx, nb in enumerate(nbs):  # nb is the filter parameter
        g = ((1+np.exp(-nb))**2)/((1+np.exp(-nb*(freq+wb)/wb))*(1+np.exp(nb*(freq-wb)/wb)))  # filter function
        dCg = np.correlate(g, g, 'same')
        dCg_norm = dCg/np.max(dCg)*100  # normalized correlation
        dCg_norms[:, idx] = dCg_norm  # add to allocated array

    return dCg_norms


def main():
    my_arr = corr_g(*[2, 4], Np=Np)
    my_arr2 = corr_g2(*[2, 4], Np=Np)

    # import matplotlib.pyplot as plt
    # d_k_vector = np.linspace(-Np / 2, Np / 2 - 1, Np)
    # plt.plot(d_k_vector/(10000*DT)/0.25, my_arr[:,1])
    # plt.plot(d_k_vector/(10000*DT)/0.25, my_arr2[:,1])
    # plt.show()


if __name__ == '__main__':
    main()
Profiling results for Np=1000:
Line # Hits Time Per Hit % Time Line Contents
==============================================================
39 #do_profile()
40 def main():
41 1 14419637.0 14419637.0 100.0 my_arr = corr_g(*[2,4], Np=Np)
42 1 1598.0 1598.0 0.0 my_arr2 = corr_g2(*[2,4], Np=Np)
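To quantify the small differences mentioned above, the two results can be compared directly. A quick sketch, assuming both functions are defined as shown (Np is kept small because corr_g is slow):
a = corr_g(2, 4, Np=1000)
b = corr_g2(2, 4, Np=1000)
print(np.abs(a - b).max())  # size of the largest discrepancy between the two implementations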
I would like to do a regression with a rolling window, but I got only one parameter back after the regression:
rolling_beta = sm.OLS(X2, X1, window_type='rolling', window=30).fit()
rolling_beta.params
The result:
X1 5.715089
dtype: float64
What could be the problem?
Thanks in advance, Roland
I think the problem is that the parameters window_type='rolling' and window=30 simply do not do anything. First I'll show you why, and at the end I'll provide a setup I've got lying around for linear regressions on rolling windows.
1. The problem with your function:
Since you haven't provided any sample data, here's a function that returns a dataframe of a desired size with some random numbers:
# Function to build synthetic data
import numpy as np
import pandas as pd
import statsmodels.api as sm
from collections import OrderedDict

def sample(rSeed, periodLength, colNames):
    np.random.seed(rSeed)
    date = pd.to_datetime("1st of Dec, 2018")
    cols = OrderedDict()
    for col in colNames:
        cols[col] = np.random.normal(loc=0.0, scale=1.0, size=periodLength)
    dates = date + pd.to_timedelta(np.arange(periodLength), 'D')
    df = pd.DataFrame(cols, index=dates)
    return df

df = sample(rSeed=123, periodLength=50, colNames=['X1', 'X2'])
Output:
X1 X2
2018-12-01 -1.085631 -1.294085
2018-12-02 0.997345 -1.038788
2018-12-03 0.282978 1.743712
2018-12-04 -1.506295 -0.798063
2018-12-05 -0.578600 0.029683
.
.
.
2019-01-17 0.412912 -1.363472
2019-01-18 0.978736 0.379401
2019-01-19 2.238143 -0.379176
Now, try:
rolling_beta = sm.OLS(df['X2'], df['X1'], window_type='rolling', window=30).fit()
rolling_beta.params
Output:
X1 -0.075784
dtype: float64
And this at least represents the structure of your output too: you're expecting an estimate for each of your sample windows, but instead you get a single estimate. So I looked around for other examples using the same function, online and in the statsmodels docs, but I was unable to find specific examples that actually worked. What I did find were a few discussions about how this functionality was deprecated a while ago. So I then tested the same function with some bogus input for the parameters:
rolling_beta = sm.OLS(df['X2'], df['X1'], window_type='amazing', window=3000000).fit()
rolling_beta.params
Output:
X1 -0.075784
dtype: float64
And as you can see, the estimates are the same, and no error messages are returned for the bogus input. So I suggest that you take a look at the function below. This is something I've put together to perform rolling regression estimates.
2. A function for regressions on rolling windows of a pandas dataframe
df = sample(rSeed=123, colNames=['X1', 'X2', 'X3'], periodLength=50)

def RegressionRoll(df, subset, dependent, independent, const, win, parameters):
    """
    RegressionRoll takes a dataframe, makes a subset of the data if you like,
    runs a series of regressions with a specified window length, and returns
    a dataframe with BETA or R^2 for each window split of the data.

    Parameters:
    ===========
    df: pandas dataframe
    subset: integer - has to be smaller than the size of the df
    dependent: string that specifies name of dependent variable
    independent: LIST of strings that specifies names of independent variables
    const: boolean - whether or not to include a constant term
    win: integer - window length of each model
    parameters: string that specifies which model parameters to return:
                BETA or R^2

    Example:
    ========
    RegressionRoll(df=df, subset=50, dependent='X1', independent=['X2'],
                   const=True, parameters='beta', win=30)
    """

    # Data subset
    if subset != 0:
        df = df.tail(subset)
    else:
        df = df

    # Loop info
    end = df.shape[0]
    win = win
    rng = np.arange(start=win, stop=end, step=1)

    # Subset and store dataframes
    frames = {}
    n = 1
    for i in rng:
        df_temp = df.iloc[:i].tail(win)
        newname = 'df' + str(n)
        frames.update({newname: df_temp})
        n += 1

    # Analysis on subsets
    df_results = pd.DataFrame()
    for frame in frames:
        # print(frames[frame])

        # Rolling data frames
        dfr = frames[frame]
        y = dependent
        x = independent

        if const == True:
            x = sm.add_constant(dfr[x])
            model = sm.OLS(dfr[y], x).fit()
        else:
            model = sm.OLS(dfr[y], dfr[x]).fit()

        if parameters == 'beta':
            theParams = model.params[0:]
            coefs = theParams.to_frame()
            df_temp = pd.DataFrame(coefs.T)
            indx = dfr.tail(1).index[-1]
            df_temp['Date'] = indx
            df_temp = df_temp.set_index(['Date'])

        if parameters == 'R2':
            theParams = model.rsquared
            df_temp = pd.DataFrame([theParams])
            indx = dfr.tail(1).index[-1]
            df_temp['Date'] = indx
            df_temp = df_temp.set_index(['Date'])
            df_temp.columns = [', '.join(independent)]

        df_results = pd.concat([df_results, df_temp], axis=0)

    return df_results

df_rolling = RegressionRoll(df=df, subset=50, dependent='X1', independent=['X2'],
                            const=True, parameters='beta', win=30)
Output: a dataframe with beta estimates from the OLS of X1 on X2 for each 30-period window of the data.
const X2
Date
2018-12-30 0.044042 0.032680
2018-12-31 0.074839 -0.023294
2019-01-01 -0.063200 0.077215
.
.
.
2019-01-16 -0.075938 -0.215108
2019-01-17 -0.143226 -0.215524
2019-01-18 -0.129202 -0.170304
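As an aside: if upgrading statsmodels is an option, versions 0.11 and later ship a rolling OLS directly, which may be simpler. A minimal sketch using the synthetic df from above:
from statsmodels.regression.rolling import RollingOLS

X = sm.add_constant(df['X2'])
rolling_model = RollingOLS(df['X1'], X, window=30).fit()
print(rolling_model.params.tail())  # one row of (const, X2) estimates per 30-observation window
The params attribute holds NaN for the rows before the first full window and one set of estimates per window thereafter.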
I have this code which works fine and gives me the result I am looking for. It loops through a list of window sizes to create rolling aggregates for each metric in the sum_metric_list, min_metric_list and max_metric_list.
# create the rolling aggregations for each window
for window in constants.AGGREGATION_WINDOW:
    # get the sum and count sums
    sum_metrics_names_list = [x[6:] + "_1_" + str(window) for x in sum_metrics_list]
    adt_df[sum_metrics_names_list] = adt_df.groupby('athlete_id')[sum_metrics_list].apply(
        lambda x: x.rolling(center=False, window=window, min_periods=1).sum())

    # get the min of mins
    min_metrics_names_list = [x[6:] + "_1_" + str(window) for x in min_metrics_list]
    adt_df[min_metrics_names_list] = adt_df.groupby('athlete_id')[min_metrics_list].apply(
        lambda x: x.rolling(center=False, window=window, min_periods=1).min())

    # get the max of max
    max_metrics_names_list = [x[6:] + "_1_" + str(window) for x in max_metrics_list]
    adt_df[max_metrics_names_list] = adt_df.groupby('athlete_id')[max_metrics_list].apply(
        lambda x: x.rolling(center=False, window=window, min_periods=1).max())
It works well on small datasets but as soon as I run it on my full data with >3000 metrics and 40 windows it becomes very slow. Is there any way to optimise this code?
The benchmark (and code) below suggests that you can save a significant amount of time by using
df.groupby(...).rolling()
instead of
df.groupby(...)[col].apply(lambda x: x.rolling(...))
The main time-saving idea here is to try to apply vectorized functions (such as sum) to the largest possible array (or DataFrame) at one time (with one function call) instead of many tiny function calls.
df.groupby(...).rolling().sum() calls sum on each (grouped) sub-DataFrame. It
can compute the rolling sums for all the columns with one call.
You could use df[sum_metrics_list+[key]].groupby(key).rolling().sum() to compute the rolling/sum on the sum_metrics_list columns.
In contrast, df.groupby(...)[col].apply(lambda x: x.rolling(...).sum()) calls sum on a single column of each (grouped) sub-DataFrame. Since you have >3000 metrics, you end up calling df.groupby(...)[col].apply(...) (with sum, min or max) over 3000 times for every window.
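Concretely, here is a sketch of that idea using the names from your question (adt_df, sum_metrics_list, 'athlete_id' and constants.AGGREGATION_WINDOW are assumed to be defined as in your code, and the index of adt_df is assumed to be unique). On recent pandas versions groupby(...).rolling(...) returns a MultiIndex whose group level has to be dropped to realign with adt_df; the min and max aggregations follow the same pattern:
for window in constants.AGGREGATION_WINDOW:
    rolled = (adt_df.groupby('athlete_id')[sum_metrics_list]
                    .rolling(window=window, min_periods=1)
                    .sum()
                    .droplevel(0))  # drop the athlete_id level to realign on the original index
    rolled.columns = [x[6:] + "_1_" + str(window) for x in sum_metrics_list]
    adt_df = adt_df.join(rolled)    # one rolling-sum call per window instead of one per metric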
Of course, this pseudo-logic of counting the number of calls is only a heuristic which may guide you in the direction of faster code. The proof is in the pudding:
import collections
import timeit

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def make_df(nrows=100, ncols=3):
    seed = 2018
    np.random.seed(seed)
    df = pd.DataFrame(np.random.randint(10, size=(nrows, ncols)))
    df['athlete_id'] = np.random.randint(10, size=nrows)
    return df

def orig(df, key='athlete_id'):
    columns = list(df.columns.difference([key]))
    result = pd.DataFrame(index=df.index)
    for window in range(2, 4):
        for col in columns:
            colname = 'sum_col{}_winsize{}'.format(col, window)
            result[colname] = df.groupby(key)[col].apply(lambda x: x.rolling(
                center=False, window=window, min_periods=1).sum())
            colname = 'min_col{}_winsize{}'.format(col, window)
            result[colname] = df.groupby(key)[col].apply(lambda x: x.rolling(
                center=False, window=window, min_periods=1).min())
            colname = 'max_col{}_winsize{}'.format(col, window)
            result[colname] = df.groupby(key)[col].apply(lambda x: x.rolling(
                center=False, window=window, min_periods=1).max())
    result = pd.concat([df, result], axis=1)
    return result

def alt(df, key='athlete_id'):
    """
    Call rolling on the whole DataFrame, not each column separately
    """
    columns = list(df.columns.difference([key]))
    result = [df]
    for window in range(2, 4):
        rolled = df.groupby(key, group_keys=False).rolling(
            center=False, window=window, min_periods=1)

        new_df = rolled.sum().drop(key, axis=1)
        new_df.columns = ['sum_col{}_winsize{}'.format(col, window) for col in columns]
        result.append(new_df)

        new_df = rolled.min().drop(key, axis=1)
        new_df.columns = ['min_col{}_winsize{}'.format(col, window) for col in columns]
        result.append(new_df)

        new_df = rolled.max().drop(key, axis=1)
        new_df.columns = ['max_col{}_winsize{}'.format(col, window) for col in columns]
        result.append(new_df)

    df = pd.concat(result, axis=1)
    return df

timing = collections.defaultdict(list)
ncols = [3, 10, 20, 50, 100]
for n in ncols:
    df = make_df(ncols=n)
    timing['orig'].append(timeit.timeit(
        'orig(df)',
        'from __main__ import orig, alt, df',
        number=10))
    timing['alt'].append(timeit.timeit(
        'alt(df)',
        'from __main__ import orig, alt, df',
        number=10))

plt.plot(ncols, timing['orig'], label='using groupby/apply (orig)')
plt.plot(ncols, timing['alt'], label='using groupby/rolling (alternative)')
plt.legend(loc='best')
plt.xlabel('number of columns')
plt.ylabel('seconds')
print(pd.DataFrame(timing, index=pd.Series(ncols, name='ncols')))
plt.show()
and yields these timeit benchmarks
alt orig
ncols
3 0.871695 0.996862
10 0.991617 3.307021
20 1.168522 6.602289
50 1.676441 16.558673
100 2.521121 33.261957
The speed advantage of alt compared to orig seems to increase as the number of columns increases.
I need to calculate XIRR of financial investments made over a period of time. Is there any function to do this in numpy, pandas or plain python?
Reference: What is XIRR?
The accepted answer in the original question is not correct and can be improved.
I created a package for fast XIRR calculation, PyXIRR.
It has no external dependencies and works faster than any existing implementation.
from datetime import date
from pyxirr import xirr
dates = [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)]
amounts = [-1000, 1000, 1000]
# feed columnar data
xirr(dates, amounts)
# feed tuples
xirr(zip(dates, amounts))
# feed DataFrame
import pandas as pd
xirr(pd.DataFrame({"dates": dates, "amounts": amounts}))
Here's an implementation taken from here.
import datetime
from scipy import optimize

def xnpv(rate, cashflows):
    chron_order = sorted(cashflows, key=lambda x: x[0])
    t0 = chron_order[0][0]
    return sum([cf/(1+rate)**((t-t0).days/365.0) for (t, cf) in chron_order])

def xirr(cashflows, guess=0.1):
    return optimize.newton(lambda r: xnpv(r, cashflows), guess)
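A quick usage sketch for the implementation above (the dates and amounts are made up); cashflows are (date, amount) pairs with outflows negative:
cashflows = [(datetime.date(2020, 1, 1), -1000),
             (datetime.date(2020, 7, 1), 300),
             (datetime.date(2021, 1, 1), 800)]
print(xirr(cashflows))  # annualized internal rate of return, roughly 0.12 here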
This implementation calculates the time delta once and then vectorizes the NPV calculation. It should run much faster than #pyCthon's solution for larger datasets. The input is a pandas series of cashflows with dates for the index.
Code
import pandas as pd
import numpy as np
from scipy import optimize

def xirr2(valuesPerDate):
    """Calculate the irregular rate of return.

    valuesPerDate is a pandas series of cashflows with index of dates.
    """
    # Clean values
    valuesPerDateCleaned = valuesPerDate[valuesPerDate != 0]

    # Check for sign change
    if valuesPerDateCleaned.min() * valuesPerDateCleaned.max() >= 0:
        return np.nan

    # Set index to time delta in years
    valuesPerDateCleaned.index = (valuesPerDateCleaned.index - valuesPerDateCleaned.index.min()).days / 365.0

    result = np.nan
    try:
        result = optimize.newton(lambda r: (valuesPerDateCleaned / ((1 + r) ** valuesPerDateCleaned.index)).sum(), x0=0, rtol=1e-4)
    except (RuntimeError, OverflowError):
        result = optimize.brentq(lambda r: (valuesPerDateCleaned / ((1 + r) ** valuesPerDateCleaned.index)).sum(), a=-0.999999999999999, b=100, maxiter=10**4)

    if not isinstance(result, complex):
        return result
    else:
        return np.nan
Tests
valuesPerDate = pd.Series(dtype=float)
for d in pd.date_range(start='1990-01-01', end='2019-12-31', freq='M'):
    valuesPerDate[d] = 10*np.random.uniform(-0.5, 1)
valuesPerDate.iloc[0] = -100  # make the first cashflow a large outflow

print(xirr2(valuesPerDate))
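As a quick sanity check with a hand-computable case (made-up dates): -100 invested and 110 returned 365 days later should give an IRR of about 10% under the 365-day year used above:
s = pd.Series([-100.0, 110.0], index=pd.to_datetime(['2020-01-01', '2020-12-31']))
print(xirr2(s))  # ~0.10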
I currently have a dataframe as below:
and wish to add a column, E, that is calculated based on the following function.
def geometric_brownian_motion(T=1, N=100, mu=0.1, sigma=0.01, S0=20):
    dt = float(T)/N
    t = np.linspace(0, T, N)
    W = np.random.standard_normal(size=N)
    W = np.cumsum(W)*np.sqrt(dt)  ### standard brownian motion ###
    X = (mu - 0.5*sigma**2)*t + sigma*W
    S = S0*np.exp(X)  ### geometric brownian motion ###
    return S
(originating from here)
How do I create a time series for all of the dates contained within the dataframe and append it?
The function input parameters are as follows:
T = (#days between df row 1 and df last)/365
N = # rows in data frame
S0 = 100
As I understand it, the essence of the question is how to apply some method to every row, taking into account the fact that to calculate a new value you need the index from the dataframe.
I suggest you extract the index as a separate column and use apply as usual.
from functools import partial

df['index'] = df.index
T = (df.index[-1] - df.index[0]).days / 365  # precalculate T here (days between first and last row, assuming a DatetimeIndex)
N = df.shape[0]

applying_method = partial(geometric_brownian_motion, T=T, N=N, S0=100)
df['E'] = df.apply(lambda row: applying_method(*row), axis=1)
Or, if you rename the columns of the dataframe according to your function's arguments:
df['E'] = df.apply(lambda row: applying_method(**row),axis=1)
Hope that helps.
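If instead a single simulated path over all of the dataframe's dates is what's needed, a minimal sketch reusing T and N from above (with mu and sigma left at the function defaults and S0=100 as in the question) would be:
df['E'] = geometric_brownian_motion(T=T, N=N, S0=100)  # one GBM value per row of df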