How to apply scipy.optimize.minimize on heston model calibrations - python

I am looking to calibrate the Heston model daily using scipy.optimize.minimize() over a period of time.
Some basic background information; I have collected information on 250.000 option trades over almost 4 years (so approx. 150 trades a day) and am looking to calibrate the heston model daily using option information on that specific day. I am however, quite new to nonlinear optimalization and even more so to scipy.optimize.minimize().
So far I have defined three functions:
the heston model function itself, which returns a dictonary with each day as key and the heston model estimated option price for each trade of that specific day as values;
the actual/observed option prices function, this function returns a dictonary in similar format as the heston model function.
the cost function, this function combines the two dictonaries of the previous functions and returns a list the sum of with all the squared differences for that specific date.
Now comes the problem, I tried to use scipy.optimize.minize() with my cost function, however I feel I have not correctly specified some parts of my three function so far in order to run the minimizer. Running scipy.optimize.minize() therefore expectly resulted in an error (picture below). It would be very much appreciate it somebody could give me some pointers on possible misspecification in my code.
picture of the dataframe: https://i.stack.imgur.com/2kkpl.png
picture of a small sample run: https://i.stack.imgur.com/LbSL5.png
picture of error when using scipy.optimize.minimize() https://i.stack.imgur.com/UuLIw.png
The code:
import pandas as pd
import datetime as dt
import time
import numpy as np
import QuantLib as ql
import scipy
from scipy.optimize import minimize
def hestonmodel(kap, the, sig, rho, init_vol):
dict1 = {}
for i in range(10):
if df_master.option_type[i] == "C":
payoff = ql.PlainVanillaPayoff(ql.Option.Call, df_master.strike_price[i])
else:
payoff = ql.PlainVanillaPayoff(ql.Option.Put, df_master.strike_price[i])
day_count = ql.Actual365Fixed()
calender = ql.NullCalendar()
experation_dates = ql.Date(df_master["maturity_date"][i],'%Y-%m-%d %H:%M:%S.%f')
calculation_dates = ql.Date(df_master["date"][i],'%Y-%m-%d %H:%M:%S.%f')
ql.Settings.instance().evaluationDate = calculation_dates
exercise = ql.EuropeanExercise(experation_dates)
option = ql.VanillaOption(payoff, exercise)
spot_price = df_master.index_price[i]
strike_price = df_master.strike_price[i]
riskfree_rate = df_master.risk_free_rate[i]
dividend = 0
variance = init_vol**2
initial_value = ql.QuoteHandle(ql.SimpleQuote(spot_price))
# Setting up flat risk free curves
discount_curve = ql.YieldTermStructureHandle(ql.FlatForward(calculation_dates, riskfree_rate,day_count))
dividend_yield = ql.YieldTermStructureHandle(ql.FlatForward(calculation_dates, dividend, day_count))
heston_process = ql.HestonProcess(discount_curve,dividend_yield, initial_value,variance,kap,the,sig,rho)
# Inputs used for the engine are model, Tolerance level, maximum evaluations
engine = ql.AnalyticHestonEngine(ql.HestonModel(heston_process),0.001,1000)
option.setPricingEngine(engine)
if i != 0:
if df_master.day_index[i] == df_master.day_index[i-1]:
dict1[df_master.day_index[i]].append(option.NPV())
else:
dict1[df_master.day_index[i]] = []
dict1[df_master.day_index[i]].append(option.NPV())
else:
dict1[df_master.day_index[i]] = []
dict1[df_master.day_index[i]].append(option.NPV())
return dict1
def actualpricefunction():
dict2 = {}
for i in range(10):
if i != 0:
if df_master.day_index[i] == df_master.day_index[i-1]:
dict2[df_master.day_index[i]].append(df_master.price[i])
else:
dict2[df_master.day_index[i]] = []
dict2[df_master.day_index[i]].append(df_master.price[i])
else:
dict2[df_master.day_index[i]] = []
dict2[df_master.day_index[i]].append(df_master.price[i])
return dict2
def costfunction(kap, the, sig, rho, init_vol):
dict1 = hestonmodel(kap, the, sig, rho, init_vol)
dict2 = actualpricefunction()
list1 = []
for i in dict1.keys():
list_temp1 = []
list_temp2 = []
d1 = dict1[i]
d2 = dict2[i]
for k in range(len(d1)):
result = pow((d1[k]-d2[k]),2)
list_temp1.append(result)
list_temp2 = sum(list_temp1)
list1.append(list_temp2)
return list1
The way I tried to run the scipy.optimize.minimize():
init_guess = (0.03,1,0.05,-0.6,0.03)
opt = si.optimize.minimize(costfunction(kap = 0.03,the = 1,sig = 0.05,rho=-0.6,init_vol=0.03), init_guess,method='Nelder-Mead', tol=1e-6)

Related

Big O notation : limited input

As an exercise, I am trying to set Monte Carlo Simulation on a chosen ticker symbol.
from numpy.random import randint
from datetime import date
from datetime import timedelta
import pandas as pd
import yfinance as yf
from math import log
# ticker symbol
ticker_input = "AAPL" # change
# start day + endday for Yahoo Finance API, 5 years of data
start_date = date.today()
end_date = start_date - timedelta(days=1826)
# retrieve data from Yahoo Finance
data = yf.download(ticker_input, end_date,start_date)
yf_data = data.reset_index()
# dataframe : define columns
df = pd.DataFrame(columns=['date', "ln_change", 'open_price', 'random_num'])
open_price = []
date_historical = []
for column in yf_data:
open_price = yf_data["Open"].values
date_historical = yf_data["Date"].values
# list order: descending
open_price[:] = open_price[::-1]
date_historical[:] = date_historical[::-1]
# Populate data into dataframe
for i in range(0, len(open_price)-1):
# date
day = date_historical[i]
# ln_change
lnc = log(open_price[i]/open_price[i+1], 2)
# random number
rnd = randint(1, 1258)
# op = (open_price[i]) open price
df.loc[i] = [day, open_price[i], lnc, rnd]
I was wondering how to calculate Big O if you have e.g. nested loops or exponential complexity but have a limited input like one in my example, maximum input size is 1259 instances of float number. Input size is not going to change.
How do you calculate code complexity in that scenario?
It is a matter of points of view. Both ways of seeing it are technically correct. The question is: What information do you wish to convey to the reader?
Consider the following code:
quadraticAlgorithm(n) {
for (i <- 1...n)
for (j <- 1...n)
doSomethingConstant();
}
quadraticAlgorithm(1000);
The function is clearly O(n2). And yet the program will always run in the same, constant time, because it just contains one function call with n=1000. It is still perfectly valid to refer to the function as O(n2). And we can refer to the program as O(1).
But sometimes the boundaries are not that clear. Then it is up to you to choose if you wish to see it as an algorithm with a time complexity as some function of n, or as a piece of constant code that runs in O(1). The importance is to make it clear to the reader how you define things.

The one-way ANOVA function I'm using keeps spitting out F values that don't make sense

I'm working on a project for college and it's kicking my ass.
I downloaded a data file from https://www.kaggle.com/datasets/majunbajun/himalayan-climbing-expeditions
I'm trying to use an ANOVA to see if there's a statistically significant difference in time taken to summit between the seasons.
The F value I'm getting back doesn't seem to make any sense. Any suggestions?
#import pandas
import pandas as pd
#import expeditions as csv file
exp = pd.read_csv('C:\\filepath\\expeditions.csv')
#extract only the data relating to everest
exp= exp[exp['peak_name'] == 'Everest']
#create a subset of the data only containing
exp_peaks = exp[['peak_name', 'member_deaths', 'termination_reason', 'hired_staff_deaths', 'year', 'season', 'basecamp_date', 'highpoint_date']]
#extract successful attempts
exp_peaks = exp_peaks[(exp_peaks['termination_reason'] == 'Success (main peak)')]
#drop missing values from basecamp_date & highpoint_date
exp_peaks = exp_peaks.dropna(subset=['basecamp_date', 'highpoint_date'])
#convert basecamp date to datetime
exp_peaks['basecamp_date'] = pd.to_datetime(exp_peaks['basecamp_date'])
#convert basecamp date to datetime
exp_peaks['highpoint_date'] = pd.to_datetime(exp_peaks['highpoint_date'])
from datetime import datetime
exp_peaks['time_taken'] = exp_peaks['highpoint_date'] - exp_peaks['basecamp_date']
#convert seasons from strings to ints
exp_peaks['season'] = exp_peaks['season'].replace('Spring', 1)
exp_peaks['season'] = exp_peaks['season'].replace('Autumn', 3)
exp_peaks['season'] = exp_peaks['season'].replace('Winter', 4)
#remove summer and unknown
exp_peaks = exp_peaks[(exp_peaks['season'] != 'Summer')]
exp_peaks = exp_peaks[(exp_peaks['season'] != 'Unknown')]
#subset the data according to the season
exp_peaks_spring = exp_peaks[exp_peaks['season'] == 1]
exp_peaks_autumn = exp_peaks[exp_peaks['season'] == 3]
exp_peaks_winter = exp_peaks[exp_peaks['season'] == 4]
#calculate the average time taken in spring
exp_peaks_spring_duration = exp_peaks_spring['time_taken']
mean_exp_peaks_spring_duration = exp_peaks_spring_duration.mean()
#calculate the average time taken in autumn
exp_peaks_autumn_duration = exp_peaks_autumn['time_taken']
mean_exp_peaks_autumn_duration = exp_peaks_autumn_duration.mean()
#calculate the average time taken in winter
exp_peaks_winter_duration = exp_peaks_winter['time_taken']
mean_exp_peaks_winter_duration = exp_peaks_winter_duration.mean()
# Turn the season column into a categorical
exp_peaks['season'] = exp_peaks['season'].astype('category')
exp_peaks['season'].dtypes
from scipy.stats import f_oneway
# One-way ANOVA
f_value, p_value = f_oneway(exp_peaks['season'], exp_peaks['time_taken'])
print("F-score: " + str(f_value))
print("p value: " + str(p_value))
It seems that f_oneway requires the different samples of continuous data to be arguments, rather than taking a categorical variable argument. You can achieve this using groupby.
f_oneway(*(group for _, group in exp_peaks.groupby("season")["time_taken"]))
Or equivalently, since you have already created series for each season:
f_oneway(exp_peaks_spring_duration, exp_peaks_autumn_duration, exp_peaks_winter_duration)
I would have thought there would be an easier way to perform an ANOVA in this common case but can't find it.

How to parallelize for loops in Pyspark?

I am trying to convert some Pandas code to Pyspark, which will run on an EMR cluster. This is my first time working with Pyspark, and I am not sure what is the optimal way to code the objective. The job is trying to achieve the following:
There is a base dataframe with schema like so:
institution_id, user_id, st_date
For every unique institution_id, get all users
For every user for the institution_id, take all unique st_dates in sorted order, get the difference between pairs of consecutive st_dates and output a dictionary
Here is what the code looks like as of now:
def process_user(current_user, inst_cycles):
current_user_dates = np.sort(current_user.st_date.unique())
if current_user_dates.size > 1:
prev_date = pd.to_datetime(current_user_dates[0]).date()
for current_datetime in current_user_dates[1:]:
current_date = pd.to_datetime(current_datetime).date()
month = current_date.month
delta = current_date - prev_date
cycle_days = delta.days
inst_cycles[month][cycle_days] += 1
prev_date = current_date
return inst_cycles
def get_inst_monthly_distribution(current_inst):
inst_cycles = defaultdict(lambda: defaultdict(int))
inst_user_ids = current_inst.select('user_id').distinct().collect()
for _, user_id in enumerate(inst_user_ids):
user_id_str = user_id[0]
current_user = current_inst.filter(current_inst.user_id == user_id_str)
inst_cycles = process_user(current_user, inst_cycles)
return inst_cycles
def get_monthly_distributions(inst_ids, df):
cycles = {}
for _, inst_id_str in enumerate(inst_ids.keys()):
current_inst = df.filter(df.inst_id == inst_id_str)
cycles[inst_id_str] = get_inst_monthly_distribution(current_inst)
return cycles
def execute():
df = load_data() # df is a Spark dataframe
inst_names = get_inst_names(df)
monthly_distributions = get_monthly_distributions(inst_names, df)
I think this code is not taking advantage of the parallelism of Spark, and can be coded in a much better way without the for loops. Is that correct?

How to vectorize this peak finding for loop in Python?

Basically I'm writing a peak finding function that needs to be able to beat scipy.argrelextrema in benchmarking. Here is a link to the data I'm using, and the code:
https://drive.google.com/open?id=1U-_xQRWPoyUXhQUhFgnM3ByGw-1VImKB
If this link expires, the data can be found at dukascopy bank's online historical data downloader.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv('EUR_USD.csv')
data.columns = ['Date', 'open', 'high', 'low', 'close','volume']
data.Date = pd.to_datetime(data.Date, format='%d.%m.%Y %H:%M:%S.%f')
data = data.set_index(data.Date)
data = data[['open', 'high', 'low', 'close']]
data = data.drop_duplicates(keep=False)
price = data.close.values
def fft_detect(price, p=0.4):
trans = np.fft.rfft(price)
trans[round(p*len(trans)):] = 0
inv = np.fft.irfft(trans)
dy = np.gradient(inv)
peaks_idx = np.where(np.diff(np.sign(dy)) == -2)[0] + 1
valleys_idx = np.where(np.diff(np.sign(dy)) == 2)[0] + 1
patt_idx = list(peaks_idx) + list(valleys_idx)
patt_idx.sort()
label = [x for x in np.diff(np.sign(dy)) if x != 0]
# Look for Better Peaks
l = 2
new_inds = []
for i in range(0,len(patt_idx[:-1])):
search = np.arange(patt_idx[i]-(l+1),patt_idx[i]+(l+1))
if label[i] == -2:
idx = price[search].argmax()
elif label[i] == 2:
idx = price[search].argmin()
new_max = search[idx]
new_inds.append(new_max)
plt.plot(price)
plt.plot(inv)
plt.scatter(patt_idx,price[patt_idx])
plt.scatter(new_inds,price[new_inds],c='g')
plt.show()
return peaks_idx, price[peaks_idx]
It basically smoothes data using a fast fourier transform (FFT) then takes the derivative to find the minimum and maximum indices of the smoothed data, then finds the corresponding peaks on the unsmoothed data. Sometimes the peaks it finds are not idea due to some smoothing effects, so I run this for loop to search for higher or lower points for each index between the bounds specified by l. I need help vectorizing this for loop! I have no idea how to do it. Without the for loop, my code is about 50% faster than scipy.argrelextrema, but the for loop slows it down. So if I can find a way to vectorize it, it'd be a very quick, and very effective alternative to scipy.argrelextrema. These two images represent the data without and with the for loop respectively.
This may do it. It's not perfect but hopefully it obtains what you want and shows you a bit how to vectorize. Happy to hear any improvements you think up
label = np.array(label[:-1]) # not sure why this is 1 unit longer than search.shape[0]?
# the idea is to make the index matrix you're for looping over row by row all in one go.
# This part is sloppy and you can improve this generation.
search = np.vstack((np.arange(patt_idx[i]-(l+1),patt_idx[i]+(l+1)) for i in range(0,len(patt_idx[:-1])))) # you can refine this.
# then you can make the price matrix
price = price[search]
# and you can swap the sign of elements so you only need to do argmin instead of both argmin and argmax
price[label==-2] = - price[label==-2]
# now find the indices of the minimum price on each row
idx = np.argmin(price,axis=1)
# and then extract the refined indices from the search matrix
new_inds = search[np.arange(idx.shape[0]),idx] # this too can be cleaner.
# not sure what's going on here so that search[:,idx] doesn't work for me
# probably just a misunderstanding
I find that this reproduces your result but I did not time it. I suspect the search generation is quite slow but probably still faster than your for loop.
Edit:
Here's a better way to produce search:
patt_idx = np.array(patt_idx)
starts = patt_idx[:-1]-(l+1)
stops = patt_idx[:-1]+(l+1)
ds = stops-starts
s0 = stops.shape[0]
s1 = ds[0]
search = np.reshape(np.repeat(stops - ds.cumsum(), ds) + np.arange(ds.sum()),(s0,s1))
Here is an alternative... it uses list comprehension which is generally faster than for-loops
l = 2
# Define the bounds beforehand, its marginally faster than doing it in the loop
upper = np.array(patt_idx) + l + 1
lower = np.array(patt_idx) - l - 1
# List comprehension...
new_inds = [price[low:hi].argmax() + low if lab == -2 else
price[low:hi].argmin() + low
for low, hi, lab in zip(lower, upper, label)]
# Find maximum within each interval
new_max = price[new_inds]
new_global_max = np.max(new_max)

Time efficiency by eliminating three for loops

I have the a script similar to this:
import random
import pandas as pd
FA = []
FB = []
Value = []
df = pd.DataFrame()
df_save = pd.DataFrame(index=['min','max'])
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
numbers = list(range(24)) # FA.unique()
mix = '(pairwise combination of days and numbers, i.e. 0Monday,0Tuesday,...1Monday,1Tuesday,....)' 'I dont know how to do this combination btw'
def Calculus():
global min,max
min = df['Value'][boolean].min()
max = df['Value'][boolean].max()
for i in range(1000):
FA.append(random.randrange(0,23,1))
FB.append(random.choice(days))
Value.append(random.random())
df['FA'] = FA
df['FB'] = FB
df['FAB'] = df['FA'].astype(str) + df['FB'].astype(str)
df['Value'] = Value
mix_factor = df['FA'].astype(str) + df['FB'].astype(str)
for i in numbers:
boolean = df['FA'] == i
Calculus()
df_save[str(i)] = [min,max]
for i in days:
boolean = df['FB'] == i
Calculus()
df_save[str(i)] = [min,max]
for i in mix_factor.unique():
boolean = df['FAB'] == i
Calculus() #
df_save[str(i)] = [min,max]
My question is: there is another way to do the same but more time efficiently? My real data (df in this case) is a csv with millions of rows and this three loops are taking forever.
Maybe using 'apply' but I never have worked with it before.
Any insight will be very appreciate, thanks.
You could put all three loops into one, depending on what your exact code is. Is there a parameter for calculus? If not, putting them into one would allow you to have to run Calculus() less

Categories

Resources