I'm currently tasked with finding the average drawdown of 7 assets. This is what I have so far:
end = dt.datetime.today()
start = end - dt.timedelta(365)
tickers = ["SBUX", "MCD", "CMG", "WEN", "DPZ", "YUM", "DENN"]
bench = ['SPY', 'IWM', 'DIA']
table_1 = pd.DataFrame(index=tickers)
data = yf.download(tickers+bench, start, end)['Adj Close']
log_returns = np.log(data/data.shift())
table_1["drawdown"] = (log_returns.min() - log_returns.max() ) / log_returns.max()
However, this only gives me the maximum drawdown, when I actually want the average.
You will need scipy to find local max/min:
from scipy.signal import argrelextrema
I've defined a function that calculates the local min and max of the time series. Then simply calculate the relative difference between each local maximum and next local minimum and compute the mean:
def av_dd(series):
series = series.values # convert to numpy array
drawdowns = []
loc_max = argrelextrema(series, np.greater)[0] # getting indexes of local maximums
loc_min = argrelextrema(series, np.less)[0] # getting indexes of local minimums
# adding first value of series if first local minimum comes before first local maximum (you want the first drawdown to be taken into account)
if series[0]>series[1]:
loc_max = np.insert(loc_max,0,0)
# adding last value of series if last local maximum comes after last local minimum (you want the last drawdown to be taken into account)
if len(loc_max)>len(loc_min):
loc_min = np.append(loc_min, len(series)-1)
for i in range(len(loc_max)):
drawdowns.append(series[loc_min[i]]/series[loc_max[i]]-1)
return sum(drawdowns)/len(drawdowns)
Both if statements in the function are here to make sure that you also take into account the first and last drawdown depending what are the local extremas at the beginning and end of the time series.
You simply need to apply this function to your data time
table_1['drawdown'] = df.apply(lambda x: av_dd(x))
Related
I am trying to create a sliding window for a time series. So far I have a function that I managed to get working that lets you take a given series, set a window size in seconds and then create a rolling sample. My issue is that it is taking very long to run and seems like an inefficient approach.
# ========== create dataset =========================== #
import pandas as pd
from datetime import timedelta, datetime
timestamp_list = ["2022-02-07 11:38:08.625",
"2022-02-07 11:38:09.676",
"2022-02-07 11:38:10.084",
"2022-02-07 11:38:10.10000",
"2022-02-07 11:38:11.2320"]
bid_price_list = [1.14338,
1.14341,
1.14340,
1.1434334,
1.1534334]
df = pd.DataFrame.from_dict(zip(timestamp_list, bid_price_list))
df.columns = ['timestamp','value']
# make date time object
df.timestamp = [datetime.strptime(time_i, "%Y-%m-%d %H:%M:%S.%f") for time_i in df.timestamp]
df.head(3)
timestamp value timestamp_to_sec
0 2022-02-07 11:38:08.625 1.14338 2022-02-07 11:38:08
1 2022-02-07 11:38:09.676 1.14341 2022-02-07 11:38:09
2 2022-02-07 11:38:10.084 1.14340 2022-02-07 11:38:10
# ========== create rolling time-series function ====== #
# get the floor of time (second value)
df["timestamp_to_sec"] = df["timestamp"].dt.floor('s')
# set rollling window length in seconds
window_dt = pd.Timedelta(seconds=2)
# containers for rolling sample statistics
n_list = []
mean_list = []
std_list =[]
# add dt (window) seconds to the original time which was floored to the second
df["timestamp_to_sec_dt"] = df["timestamp_to_sec"] + window_dt
# get unique end times
time_unique_endlist = np.unique(df.timestamp_to_sec_dt)
# remove end times that are greater than the last actual time, i.e. max(df["timestamp_to_sec"])
time_unique_endlist = time_unique_endlist[time_unique_endlist <= max(df["timestamp_to_sec"])]
# loop running the sliding window (time_i is the end time of each window)
for time_i in time_unique_endlist:
# start time of each rolling window
start_time = time_i - window_dt
# sample for each time period of sliding window
rolling_sample = df[(df.timestamp >= start_time) & (df.timestamp <= time_i)]
# calculate the sample statistics
n_list.append(len(rolling_sample)) # store n observation count
mean_list.append(rolling_sample.mean()) # store rolling sample mean
std_list.append(rolling_sample.std()) # store rolling sample standard deviation
# plot histogram for each sample of the rolling sample
#plt.hist(rolling_sample.value, bins=10)
# tested and n_list brought back the correct values
>>> n_list
[2,3]
Is there a more efficient way of doing this, a way I could improve my interpretation or an open-source package that allows me to run a rolling window like this? I know that there is the .rolling() in pandas but that rolls on the values. I want something that I can use on unevenly-spaced data, using the time to define the fixed rolling window.
It seems like this is the best performance, hope it helps anyone else.
# set rollling window length in seconds
window_dt = pd.Timedelta(seconds=2)
# add dt seconds to the original timestep
df["timestamp_to_sec_dt"] = df["timestamp_to_sec"] + window_dt
# unique end time
time_unique_endlist = np.unique(df.timestamp_to_sec_dt)
# remove end values that are greater than the last actual value, i.e. max(df["timestamp_to_sec"])
time_unique_endlist = time_unique_endlist[time_unique_endlist <= max(df["timestamp_to_sec"])]
# containers for rolling sample statistics
mydic = {}
counter = 0
# loop running the rolling window
for time_i in time_unique_endlist:
start_time = time_i - window_dt
# sample for each time period of sliding window
rolling_sample = df[(df.timestamp >= start_time) & (df.timestamp <= time_i)]
# calculate the sample statistics
mydic[counter] = {
"sample_size":len(rolling_sample),
"sample_mean":rolling_sample["value"].mean(),
"sample_std":rolling_sample["value"].std()
}
counter = counter + 1
# results in a DataFrame
results = pd.DataFrame.from_dict(mydic).T
I am using the this dataset for a project.
I am trying to find the total yield for each inverter for the 34 day duration of the dataset (basically use the final and initial value available for each inverter). I have been able to get the list of inverters using pd.unique()(there are 22 inverters for each solar power plant.
I am having trouble querying the total_yield data for each inverter.
Here is what I have tried:
def get_yields(arr: np.ndarray, df:pd.core.frame.DataFrame) -> np.ndarray:
delta = np.zeros(len(arr))
index =0
for i in arr:
initial = df.loc[df["DATE_TIME"]=="15-05-2020 02:00"]
initial = initial.loc[initial["INVERTER_ID"]==i]
initial.reset_index(inplace=True,drop=True)
initial = initial.at[0,"TOTAL_YIELD"]
final = df.loc[(df["DATE_TIME"]=="17-06-2020 23:45")]
final = final.loc[final["INVERTER_ID"]==i]
final.reset_index(inplace=True, drop=True)
final = final.at[0,"TOTAL_YIELD"]
delta[index] = final - initial
index = index + 1
return delta
Reference: arr is the array of inverters, listed below. df is the generation dataframe for each plant.
The problem is that not every inverter has a data point for each interval. This makes this function only work for the inverters at the first plant, not the second one.
My second approach was to filter by the inverter first, then take the first and last data points. But I get an error- 'Series' objects are mutable, thus they cannot be hashed
Here is the code for that so far:
def get_yields2(arr: np.ndarray, df: pd.core.frame.DataFrame) -> np.ndarry:
delta = np.zeros(len(arr))
index = 0
for i in arr:
initial = df.loc(df["INVERTER_ID"] == i)
index += 1
break
return delta
List of inverters at plant 1 for reference(labeled as SOURCE_KEY):
['1BY6WEcLGh8j5v7' '1IF53ai7Xc0U56Y' '3PZuoBAID5Wc2HD' '7JYdWkrLSPkdwr4'
'McdE0feGgRqW7Ca' 'VHMLBKoKgIrUVDU' 'WRmjgnKYAwPKWDb' 'ZnxXDlPa8U1GXgE'
'ZoEaEvLYb1n2sOq' 'adLQvlD726eNBSB' 'bvBOhCH3iADSZry' 'iCRJl6heRkivqQ3'
'ih0vzX44oOqAx2f' 'pkci93gMrogZuBj' 'rGa61gmuvPhdLxV' 'sjndEbLyjtCKgGv'
'uHbuxQJl8lW7ozc' 'wCURE6d3bPkepu2' 'z9Y9gH1T5YWrNuG' 'zBIq5rxdHJRwDNY'
'zVJPv84UY57bAof' 'YxYtjZvoooNbGkE']
List of inverters at plant 2:
['4UPUqMRk7TRMgml' '81aHJ1q11NBPMrL' '9kRcWv60rDACzjR' 'Et9kgGMDl729KT4'
'IQ2d7wF4YD8zU1Q' 'LYwnQax7tkwH5Cb' 'LlT2YUhhzqhg5Sw' 'Mx2yZCDsyf6DPfv'
'NgDl19wMapZy17u' 'PeE6FRyGXUgsRhN' 'Qf4GUc1pJu5T6c6' 'Quc1TzYxW2pYoWX'
'V94E5Ben1TlhnDV' 'WcxssY2VbP4hApt' 'mqwcsP2rE7J0TFp' 'oZ35aAeoifZaQzV'
'oZZkBaNadn6DNKz' 'q49J1IKaHRwDQnt' 'rrq4fwE8jgrTyWY' 'vOuJvMaM2sgwLmb'
'xMbIugepa2P7lBB' 'xoJJ8DcxJEcupym']
Thank you very much.
I can't download the dataset to test this. Getting "To May Requests" Error.
However, you should be able to do this with a groupby.
import pandas as pd
result = df.groupby('INVERTER_ID')['TOTAL_YIELD'].agg(['max','min'])
result['delta'] = result['max']-result['min']
print(result[['delta']])
So if I'm understanding this right, what you want is the TOTAL_YIELD for each inverter for the beginning of the time period starting 5-05-2020 02:00 and ending 17-06-2020 23:45. Try this:
# enumerate lets you have an index value along with iterating through the array
for i, code in enumerate(arr):
# to filter the info to between the two dates, but not necessarily assuming that
# each inverter's data starts and ends at each date
inverter_df = df.loc[df['DATE_TIME'] >= pd.to_datetime('15-05-2020 02:00:00')]
inverter_df = inverter_df.loc[inverter_df['DATE_TIME'] <= pd.to_datetime('17-06-2020
23:45:00')]
inverter_df = inverter_df.loc[inverter_df["INVERTER_ID"]==code]]
# sort by date
inverter_df.sort_values(by='DATE_TIME', inplace= True)
# grab TOTAL_YIELD at the first available date
initial = inverter_df['TOTAL_YIELD'].iloc[0]
# grab TOTAL_YIELD at the last available date
final = inverter_df['TOTAL_YIELD'].iloc[-1]
delta[index] = final - initial
I have a set of EURUSD data and looking at arbitrage opportunities. The data is formatted as shown in photo.
mispricing_1=yes when buy_b_sell_A>0 and mispricing_2=yes when buy_A_sell_B>0
In the photo there is no datapoint where exploitable=yes however when the buy_b_sell_A>6 or when buy_A_sell_B>6, then we get exploitable=yes
I am looking to calculate the average length of time an exploitable arbitrage opportunity is present, shown by exploitable=yes
How can I calculate the length of time that there are consecutive exploitable=yes so that I can plot a distribution and then also calculate the average?
df=pd.DataFrame(data={'ts':list(range(1,14)),
'mp':[0,0,1,1,1,0,0,1,1,0,0,1,0]}) # your data
df.loc[df.mp.diff(1)==1, 'ts1'] = df.ts # TS1
df.loc[df.mp.diff(1)==-1, 'ts2'] = df.ts # TS2
df=df[~(df.ts1.isna())|~(df.ts2.isna())] # keep only rows with changes
df.loc[~df.ts2.isna(), 'delta'] = df.ts2 - df.ts1.shift(1) # TS2-TS1
print (df)
If you import this as a panda frame, which lets call it df, you can do df.groupby[‘exploitable’].mean
You could do .histogram or something for distribution.
I have a timestamp from a database and need to make a plot (rates vs time). I have the timestamps from when the process starts and ends, but I need to make the starting timestamp equal to 0 min (initial value) and the ending value equal to 20-30ish minutes (depending on the trial). I'm not sure what to use.
Also, I have the rates as a list and need to put them in an array for the matplotlib. I used np.asarray() and it says that the type is an array, but it is only giving me one number (the last number) on my plot. Any ideas on how to solve this?
code:
# timestamp comes out as 2.0080506043443555 e-16 because of the float
# need to change that into minutes for each run
L3time = L3timestamp.split()
del L3time[0]
for k in range(0, len(L3time)):
print(float(L3time[k]))
#print("this is the L3 rates")
for k in range(1, len(L3time)):
L3rate = (float(L3[k]) - float(L3[k-1]))*1000/(float(L3time[k]) - float(L3time[k-1]))
print(float(L3rate))
# putting the L3 rate into an array
L3RateArray = np.asarray(L3rate)
# putting the timestamp into an array
timestampArray = np.asarray(L3time)
for k in inFile.readlines():
plt.plot([timestampArray], [L3RateArray], 'ro')
plt.xlabel("time (m)")
plt.ylabel("L3 Rates (Hz)")
plt.suptitle("L3 Rates vs. Time")
plt.show()
I am trying to build an equity curve in Python using Pandas. For those not in the know, an equity curve is a cumulative tally of investing profits/losses day by day. The code below works but it is incredibly slow. I've tried to build an alternate using Pandas .iloc and such but nothing is working. I'm not sure if it is possible to do this outside of a loop given how I have to reference the prior row(s).
for today in range(len(f1)): #initiate a loop that runs the length of the "f1" dataframe
if today == 0: #if the index value is zero (aka first row in the dataframe) then...
f1.loc[today,'StartAUM'] = StartAUM #Set intial assets
f1.loc[today,'Shares'] = 0 #dummy placeholder for shares; no trading on day 1
f1.loc[today,'PnL'] = 0 #dummy placeholder for P&L; no trading day 1
f1.loc[today,'EndAUM'] = StartAUM #set ending AUM; should be beginning AUM since no trades
continue #and on to the second row in the dataframe
yesterday = today - 1 #used to reference the rows (see below)
f1.loc[today,'StartAUM'] = f1.loc[yesterday,'EndAUM'] #todays starting aseets are yesterday's ending assets
f1.loc[today,'Shares'] = f1.loc[yesterday,'EndAUM']//f1.loc[yesterday,'Shareprice'] #today's shares to trade = yesterday's assets/yesterday's share price
f1.loc[today,'PnL'] = f1.loc[today,'Shares']*f1.loc[today,'Outcome1'] #Our P&L should be the shares traded (see prior line) multiplied by the outcome for 1 share
#Note Outcome1 came from the dataframe before this loop >> for the purposes here it's value is irrelevant
f1.loc[today,'EndAUM'] = f1.loc[today,'StartAUM']+f1.loc[today,'PnL'] #ending assets are starting assets + today's P&L
There is a good example here: http://www.pythonforfinance.net/category/basic-data-analysis/ and I know that there is an example in Wes McKinney's book Python for Data Analysis. You might be able to find it here: http://wesmckinney.com/blog/python-for-financial-data-analysis-with-pandas/
Have you tried using iterrows() to construct the for loop?
for index, row in f1.iterrows():
if today == 0:
row['StartAUM'] = StartAUM #Set intial assets
row['Shares'] = 0 #dummy placeholder for shares; no trading on day 1
row['PnL'] = 0 #dummy placeholder for P&L; no trading day 1
row['EndAUM'] = StartAUM #set ending AUM; should be beginning AUM since no trades
continue #and on to the second row in the dataframe
yesterday = row[today] - 1 #used to reference the rows (see below)
row['StartAUM'] = row['EndAUM'] #todays starting aseets are yesterday's ending assets
row['Shares'] = row['EndAUM']//['Shareprice'] #today's shares to trade = yesterday's assets/yesterday's share price
row['PnL'] = row['Shares']*row['Outcome1'] #Our P&L should be the shares traded (see prior line) multiplied by the outcome for 1 share
#Note Outcome1 came from the dataframe before this loop >> for the purposes here it's value is irrelevant
row['EndAUM'] = row['StartAUM']+row['PnL'] #ending assets are starting assets + today's P&L
Probably the code is so slow as loc goes through f1 from beginning every time. iterrows() uses the same dataframe as it loops through it row by row.
See more details about iterrows() here.
You need to vectorize the operations (don't iterate with for but rather compute whole column at once)
# fill the initial values
f1['StartAUM'] = StartAUM # Set intial assets
f1['Shares'] = 0 # dummy placeholder for shares; no trading on day 1
f1['PnL'] = 0 # dummy placeholder for P&L; no trading day 1
f1['EndAUM'] = StartAUM # s
#do the computations (vectorized)
f1['StartAUM'].iloc[1:] = f1['EndAUM'].iloc[:-1]
f1['Shares'].iloc[1:] = f1['EndAUM'].iloc[:-1] // f1['Shareprice'].iloc[:-1]
f1['PnL'] = f1['Shares'] * f1['Outcome1']
f1['EndAUM'] = f1['StartAUM'] + f1 ['PnL']
EDIT: this will not work correctly since StartAUM, EndAUM, Shares depend on each other and cannot be computed one without another. I didn't notice that before.
Can you try the following:
#import relevant modules
import pandas as pd
import numpy as np
from pandas_datareader import data
import matplotlib.pyplot as plt
#download data into DataFrame and create moving averages columns
f1 = data.DataReader('AAPL', 'yahoo',start='1/1/2017')
StartAUM = 1000000
#populate DataFrame with starting values
f1['Shares'] = 0
f1['PnL'] = 0
f1['EndAUM'] = StartAUM
#Set shares held to be the previous day's EndAUM divided by the previous day's closing price
f1['Shares'] = f1['EndAUM'].shift(1) / f1['Adj Close'].shift(1)
#Set the day's PnL to be the number of shares held multiplied by the change in closing price from yesterday to today's close
f1['PnL'] = f1['Shares'] * (f1['Adj Close'] - f1['Adj Close'].shift(1))
#Set day's ending AUM to be previous days ending AUM plus daily PnL
f1['EndAUM'] = f1['EndAUM'].shift(1) + f1['PnL']
#Plot the equity curve
f1['EndAUM'].plot()
Does the above solve your issue?
The solution was to use the Numba package. It performs the loop task in a fraction of the time.
https://numba.pydata.org/
The arguments/dataframe can be passed to the numba module/function. I will try to write up a more detailed explanation with code when time permits.
Thanks to all
In case others come across this, you can definitely make an equity curve without loops.
Dummy up some data
import pandas as pd
import numpy as np
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (13, 10)
# Some data to work with
np.random.seed(1)
stock = pd.DataFrame(
np.random.randn(100).cumsum() + 10,
index=pd.date_range('1/1/2020', periods=100, freq='D'),
columns=['Close']
)
stock['ma_5'] = stock['Close'].rolling(5).mean()
stock['ma_15'] = stock['Close'].rolling(15).mean()
Holdings: simple long/short based on moving average crossover signals
longs = stock['Close'].where(stock['ma_5'] > stock['ma_15'], np.nan)
shorts = stock['Close'].where(stock['ma_5'] < stock['ma_15'], np.nan)
# Quick plot
stock.plot()
longs.plot(lw=5, c='green')
shorts.plot(lw=5, c='red')
EQUITY CURVE:
Identify which side (l/s) has first holding (ie: first trade, in this case, short), then keep the initial trade price and subsequently cumulatively sum the daily changes (there would normally be more nan's in the series if you have exit rules as well for when you are out of the market), and finally forward fill over the nan values and fill any last remaining nans with zeros. Its basically the same for the second opposite holdings (in this case, long) except don't keep the starting price. The other important thing is to invert the short daily changes (ie: negative changes should be positive to the PnL).
lidx = np.where(longs > 0)[0][0]
sidx = np.where(shorts > 0)[0][0]
startdx = min(lidx, sidx)
# For first holding side, keep first trade price, then calc daily change fwd and ffill nan's
# For second holdng side, get cumsum of daily changes, ffill and fillna(0) (make sure short changes are inverted)
if lidx == startdx:
lcurve = longs.diff() # get daily changes
lcurve[lidx] = longs[lidx] # put back initial starting price
lcurve = lcurve.cumsum().ffill() # add dialy changes/ffill to build curve
scurve = -shorts.diff().cumsum().ffill().fillna(0) # get daily changes (make declines positive changes)
else:
scurve = -shorts.diff() # get daily changes (make declines positive changes)
scurve[sidx] = shorts[sidx] # put back initial starting price
scurve = scurve.cumsum().ffill() # add dialy changes/ffill to build curve
lcurve = longs.diff().cumsum().ffill().fillna(0) # get daily changes
Add the 2 long/short curves together to get the final equity curve
eq_curve = lcurve + scurve
# quick plot
stock.iloc[:, :3].plot()
longs.plot(lw=5, c='green', label='Long')
shorts.plot(lw=5, c='red', label='Short')
eq_curve.plot(lw=2, ls='dotted', c='orange', label='Equity Curve')
plt.legend()