How to avoid sigkill error 9? - python

I am trying to build an algorithm which first builds the power set of around 100 symbols, excluding the null set and repeated elements.
Then, for each item in the power set, it reads a data file and evaluates the Sharpe ratio (return/risk).
Results are appended to a list, and at the end the program reports the combination of symbols with the highest Sharpe ratio.
Following is the code:
import pandas as pd
import numpy as np
import math
from itertools import chain, combinations
import operator
import time as t
#ASSUMPTION
#EQUAL ALLOCATION OF RESOURCES
t0 = t.time()
start_date = '2016-06-01'
end_date = '2017-08-18'
allocation = 170000
usesymbols=['PAEL','TPL','SING','DCL','POWER','FCCL','DGKC','LUCK',
'THCCL','PIOC','GWLC','CHCC','MLCF','FLYNG','EPCL',
'LOTCHEM','SPL','DOL','NRSL','AGL','GGL','ICL','AKZO','ICI',
'WAHN','BAPL','FFC','EFERT','FFBL','ENGRO','AHCL','FATIMA',
'EFOODS','QUICE','ASC','TREET','ZIL','FFL','CLOV',
'BGL','STCL','GGGL','TGL','GHGL','OGDC','POL','PPL','MARI',
'SSGC','SNGP','HTL','PSO','SHEL','APL','HASCOL','RPL','MERIT',
'GLAXO','SEARL','FEROZ','HINOON','ABOT','KEL','JPGL','EPQL',
'HUBC','PKGP','NCPL','LPL','KAPCO','TSPL','ATRL','BYCO','NRL','PRL',
'DWSM','SML','MZSM','IMSL','SKRS','HWQS','DSFL','TRG','PTC','TELE',
'WTL','MDTL','AVN','NETSOL','SYS','HUMNL','PAKD',
'ANL','CRTM','NML','NCL','GATM','CLCPS','GFIL','CHBL',
'DFSM','KOSM','AMTEX','HIRAT','NCML','CTM','HMIM',
'CWSM','RAVT','PIBTL','PICT','PNSC','ASL',
'DSL','ISL','CSAP','MUGHAL','DKL','ASTL','INIL']
cost_matrix = []
def data(symbols):
    dates = pd.date_range(start_date, end_date)
    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv('/home/furqan/Desktop/python_data/{}.csv'.format(str(symbol)),
                              usecols=['Date', 'Close'],
                              parse_dates=True, index_col='Date', na_values=['nan'])
        df_temp = df_temp.rename(columns={'Close': symbol})
        df = df.join(df_temp)
    df = df.fillna(method='ffill')
    df = df.fillna(method='bfill')
    return df
def mat_alloc_auto(symbols):
    n = len(symbols)
    mat_alloc = np.zeros((n, n), dtype='float')
    for i in range(0, n):
        mat_alloc[i, i] = allocation / n
    return mat_alloc
def compute_daily_returns(df):
    """Compute and return the daily return values."""
    daily_returns = (df / df.shift(1)) - 1
    df = df.fillna(value=0)
    daily_returns = daily_returns[1:]
    daily_returns = np.array(daily_returns)
    return daily_returns
def port_eval(matrix_alloc, daily_return_matrix):
    risk_free = 0
    amount_matrix = [allocation]
    return_mat = np.dot(daily_return_matrix, matrix_alloc)
    return_mat = np.sum(return_mat, axis=1, keepdims=True)
    return_mat = np.divide(return_mat, amount_matrix)
    mat_average = np.mean(return_mat)
    mat_std = np.std(return_mat, ddof=1)
    sharpe_ratio = ((mat_average - risk_free) / mat_std) * math.sqrt(252)
    return return_mat, sharpe_ratio, mat_average
def powerset(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s) + 1))
power_set = list(powerset(usesymbols))
len_power = len(power_set)
sharpe = []
for j in range(0, len_power):
    df_01 = data(power_set[j])
    matrix_allocation = mat_alloc_auto(power_set[j])
    daily_return_mat = compute_daily_returns(df_01)
    return_matrix, sharpe_ratio_val, matrix_average = port_eval(matrix_allocation, daily_return_mat)
    sharpe.append(sharpe_ratio_val)
max_index, max_value = max(enumerate(sharpe), key=operator.itemgetter(1))
print('Maximum sharpe ratio occurs from ', power_set[max_index], ' value = ', max_value)
t1 = t.time()
print('exec time is ', t1 - t0, 'seconds')
The above code results in a SIGKILL (error 9).
After some research I understood that this happens because the process allocates too much memory, putting pressure on the OS.
So I tried running the same code on an HP Z600 workstation, but it takes a lot of time and the machine freezes.
My question is: how can I make my code more efficient, so that I get results quickly?
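A note on scale: with 100+ symbols the power set has more than 2**100 subsets, so list(powerset(usesymbols)) can never fit in memory, and even a lazy loop over all of it is computationally infeasible. Below is a minimal sketch of two mitigations, assuming a cap on portfolio size (max_r is a hypothetical parameter, not in the original code) and reading every CSV once up front:

max_r = 3  # hypothetical cap on subset size; the full power set is intractable
all_data = data(usesymbols)  # one pass over the CSV files
best_sharpe, best_combo = float('-inf'), None
for r in range(1, max_r + 1):
    for combo in combinations(usesymbols, r):  # lazy generator: no giant list
        df_01 = all_data[list(combo)]
        matrix_allocation = mat_alloc_auto(combo)
        daily_return_mat = compute_daily_returns(df_01)
        _, sharpe_ratio_val, _ = port_eval(matrix_allocation, daily_return_mat)
        if sharpe_ratio_val > best_sharpe:  # keep only the running best
            best_sharpe, best_combo = sharpe_ratio_val, combo
print('Maximum sharpe ratio occurs from', best_combo, 'value =', best_sharpe)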

Related

Battery Storage Pyomo: optimize and iterate yearly data over a 365-hour time horizon

I have yearly data on electricity prices called 'HOEP'. With my pyomo model, I want to determine the behavior of a battery for the whole year, but over a 365-hour time horizon (energy in = Ein and energy out = Eout). In other words, I want to make my algorithm run for the first 365 hours, then run again over the next 365-hour horizon with the initial battery state equal to the last hour of the previous horizon.
I have tried dividing my yearly data into chunks (24 chunks of 365 hours in the year). With df_list = np.vsplit(dfa, 24), I create a list of chunks and transform them into 24 different dataframes. Then, I use for idx, df in enumerate([df0, df1, df2]) (here only 3 chunks for testing) before my model to loop over the data. However, when I look at my results, it seems that the model only optimizes for the last argument of enumerate([df0, df1, df2]), which is df2.
Does anybody know why it does not work for the 3 chunks? Or how could I do this in a different way?
Thank you in advance for your help!
Here is the edited version of my code that works now, but I know it is probably not the most pythonic way of doing this.
import numpy as np
import pandas as pd
from typing import List
from itertools import chain
from pyomo.environ import *

output = []
for idx, df in enumerate([df0, df1, df2]):
    model = ConcreteModel()

    # Variables of the model
    model.T = Set(initialize=df.hour.tolist(), ordered=True)
    model.Rmax = Param(initialize=1, within=Any)
    model.Smax = Param(initialize=5, within=Any)
    model.Dmax = Param(initialize=5, within=Any)
    model.Ein = Var(model.T, domain=NonNegativeReals)
    model.Eout = Var(model.T, domain=NonNegativeReals)
    model.Z = Var(model.T, domain=NonNegativeReals)
    model.L = Var(model.T, domain=NonNegativeReals)
    model.NES = Var(model.T)

    # Constraints
    def storage_state(model, t):
        if t == model.T.first():
            return model.Z[t] == 0
        else:
            return model.Z[t] == (model.Z[t-1] + model.Ein[t] - model.Eout[t])
    model.charge_state = Constraint(model.T, rule=storage_state)

    def discharge_constraint(model, t):
        return model.Eout[t] <= model.Rmax
    model.discharge = Constraint(model.T, rule=discharge_constraint)

    def charge_constraint(model, t):
        return model.Ein[t] <= model.Rmax
    model.charge = Constraint(model.T, rule=charge_constraint)

    def positive_charge(model, t):
        return model.Eout[t] <= model.Z[t]
    model.positive_charge = Constraint(model.T, rule=positive_charge)

    def max_SOC(model, t):
        return model.Z[t] <= model.Smax
    model.max_SOC = Constraint(model.T, rule=max_SOC)

    def demand_constraint(model, t):
        return model.L[t] == (df.loc[t, 'MktDemand'] + model.Ein[t] - model.Eout[t])
    model.demand_constraint = Constraint(model.T, rule=demand_constraint)

    def discharge_limit(model, t):
        max_t = model.T.last()
        if t < max_t - 24:
            return sum(model.Eout[i] for i in range(t, t + 24)) <= model.Dmax
        else:
            return Constraint.Skip
    model.limit_disch_out = Constraint(model.T, rule=discharge_limit)

    def charge_limit(model, t):
        max_t = model.T.last()
        if t < max_t - 24:
            return sum(model.Ein[i] for i in range(t, t + 24)) <= model.Dmax
        else:
            return Constraint.Skip
    model.limit_charg_out = Constraint(model.T, rule=charge_limit)

    def Net_energy_sold(model, t):
        return model.NES[t] == ((model.Eout[t] - model.Ein[t]) / model.Rmax * 100)
    model.net_energy = Constraint(model.T, rule=Net_energy_sold)

    # Objective function and optimization
    income = sum(df.loc[t, 'HOEP'] * model.Eout[t] for t in model.T)
    expenses = sum(df.loc[t, 'HOEP'] * model.Ein[t] for t in model.T)
    profits = income - expenses
    model.objective = Objective(expr=profits, sense=maximize)

    # Solve model
    solver = SolverFactory('glpk')
    solver.solve(model)

    # Extract model output in list
    Date = list(df['Date'])
    output.append([Date, model.Ein.get_values().values(), model.Eout.get_values().values(),
                   model.Z.get_values().values(), model.NES.get_values().values(),
                   model.L.get_values().values()])
df_results = pd.DataFrame(output)
df_results.rename(columns={0: 'Date', 1: 'Ein', 2: 'Eout', 3: 'Z', 4: 'NES', 5: 'Load'}, inplace=True)
df_results

# Present final results in dataframe
d = ein = eout = z = l = nes = []
for i in list(df_results.index):
    d = d + list(df_results.loc[i, 'Date'])
    ein = ein + list(df_results.loc[i, 'Ein'])
    eout = eout + list(df_results.loc[i, 'Eout'])
    z = z + list(df_results.loc[i, 'Z'])
    nes = nes + list(df_results.loc[i, 'NES'])
    l = l + list(df_results.loc[i, 'Load'])
results = pd.DataFrame(zip(d, ein, eout, z, nes, l), columns=['Date', 'Ein', 'Eout', 'SOC', 'NES', 'Load'])
results
# Returned dataframe
Date Ein Eout SOC NES Load
0 2019-01-01 0.0 0.00 0.00 0.0 16231.00
1 2019-01-01 0.0 0.00 0.00 0.0 16051.00
2 2019-01-01 1.0 0.00 1.00 -100.0 15806.00
3 2019-01-01 1.0 0.00 2.00 -100.0 15581.00
...
Why it isn't working
(Disclaimer: this is one issue I see; there might be others.)
At each iteration of the for loop, list_of_series is defined from scratch, so all the results obtained in previous iterations are lost.
I'd also check that df.hour is "hour of the year" or "hour from beginning of data" rather than "hour of the day" (if it's the latter, this will also cause an error).
Fixing the problem
(There are several solutions, obviously.) At each iteration of the for loop, turn list_of_series into a pd.DataFrame and append that dataframe to a list.
At the end of the for loop (once you have run the model on each chunk of data), concatenate the list of dataframes.
from typing import List
...
# find a better name; variable names shouldn't specify the type
list_of_dataframes: List[pd.DataFrame] = []
for ...:  # for each chunk of data
    ...  # create model, solve
    list_of_series = ...
    list_of_dataframes.append(pd.DataFrame(list_of_series))
results = pd.concat(list_of_dataframes, axis=0)  # use `ignore_index=True` if needed
A few tips
Break your code into functions. Create a function which defines the model: that highlights what the inputs and outputs are, makes the for loop more readable, and allows you to reuse the model in other contexts and potentially to test it.
(Opinionated) Set your "data" as parameters of the model, instead of using the data directly to construct constraints and the objective function. This gives you a single place where each piece of data enters the model, creates internal consistency, and allows you to extract results purely from the optimized model.
Separate I/O (reading/writing files) from the rest of the code. If your data source changes format or filetype, you'll be able to change that without changing any of the rest of the code.
def main(input_data: pd.DataFrame) -> pd.DataFrame:
    # group by week, month, or any applicable resolution
    # this assumes the index is a `pd.DatetimeIndex`
    # `MS` is "Month Start" - watch out with weeks because `freq="w"` starts
    # on Mondays, and your data might start on a different weekday.
    # If you want to split into chunks of some number of days,
    # use e.g. `freq="14d"`
    grouped = input_data.groupby(pd.Grouper(freq="MS"))
    results_list: List[pd.DataFrame] = []
    for month, group in grouped:
        model = create_model(group)
        optimization_results = SolverFactory('glpk').solve(model)
        results_list.append(extract_results(model))  # pass `group` if needed
    results = pd.concat(results_list, axis=0, ignore_index=True)
    return results

def create_model(df: pd.DataFrame) -> ConcreteModel:
    # NOTE: instead of hard-coding parameters such as battery capacity,
    # pass them as inputs to the function.
    model = ConcreteModel()
    ...
    return model

def extract_results(model: ConcreteModel) -> pd.DataFrame:
    ...

def load_data(filename) -> pd.DataFrame:
    ...

if __name__ == "__main__":
    input_data = load_data(...)
    results = main(input_data)
    results.to_csv(...)
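For illustration, here is one possible body for extract_results, assuming the variable names from the question's model (Ein, Eout, Z, NES, L); this is a sketch, not part of the original answer:

def extract_results(model: ConcreteModel) -> pd.DataFrame:
    # each get_values() returns a {time index: value} dict, so the columns
    # line up on the shared time index when combined into a DataFrame
    return pd.DataFrame({
        "Ein": model.Ein.get_values(),
        "Eout": model.Eout.get_values(),
        "Z": model.Z.get_values(),
        "NES": model.NES.get_values(),
        "Load": model.L.get_values(),
    })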

How can I make Matrix Multiplication Code run in parallel?

I have a block of code that takes user input and builds matrices from it, but I need it to run in parallel as opposed to running as one single process. I have no clue how to do so; I started reading into NumPy but haven't really grasped it.
import timeit

start = timeit.default_timer()

def getMatrix(name):
    matrixCreated = []
    i = 0
    while True:
        i += 1
        row = input('\nEnter elements in row %s of Matrix %s (separated by commas)\nOr -1 to exit: ' % (i, name))
        if row == '-1':
            break
        else:
            strList = row.split(',')
            matrixCreated.append(list(map(int, strList)))
    return matrixCreated

def getColAsList(matrixToManipulate, col):
    myList = []
    numOfRows = len(matrixToManipulate)
    for i in range(numOfRows):
        myList.append(matrixToManipulate[i][col])
    return myList

def getCell(matrixA, matrixB, r, c):
    matrixBCol = getColAsList(matrixB, c)
    lenOfList = len(matrixBCol)
    productList = [matrixA[r][i] * matrixBCol[i] for i in range(lenOfList)]
    return sum(productList)

matrixA = getMatrix('A')
matrixB = getMatrix('B')

rowA = len(matrixA)
colA = len(matrixA[0])
rowB = len(matrixB)
colB = len(matrixB[0])

result = [[0 for p in range(colB)] for q in range(rowA)]

if colA != rowB:
    print('The two matrices cannot be multiplied')
else:
    print('\nThe result is')
    for i in range(rowA):
        for j in range(colB):
            result[i][j] = getCell(matrixA, matrixB, i, j)
        print(result[i])

stop = timeit.default_timer()
print('Time: ', stop - start)
I also have a timer on the code to print the time taken, but since the program takes user input, the measurement directly includes how long the input takes in real time. Is there a way to time just the execution? I need to compare how much running this code in parallel decreases the run time.
numpy is an efficient C implementation, while jax is an efficient parallel implementation that also supports GPU/TPU.
Both of them will run faster than your current pure-Python implementation.
Import numpy or jax
import numpy as np
or
import jax.numpy as np
Then create the matrices
A = np.array(getMatrix('A'))
B = np.array(getMatrix('B'))
And output the matrix multiplication
C = A @ B
print(C)
If you want only the time of execution of the matrix multiplication, then move the start after the user input, like this:
...
matrixA = getMatrix('A')
matrixB = getMatrix('B')
start = timeit.default_timer()
rowA = len(matrixA)
colA = len(matrixA[0])
rowB = len(matrixB)
colB = len(matrixB[0])
...
stop = timeit.default_timer()
print('Time: ', stop - start)
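If the goal is specifically to compare a process-parallel run of the same pure-Python code against the serial loop, here is a minimal sketch using multiprocessing.Pool that reuses getMatrix and getCell from above (an illustration, not the only way; for small matrices the process start-up cost usually dominates):

from multiprocessing import Pool
from functools import partial

def computeRow(matrixA, matrixB, i):
    # compute row i of the product with the question's getCell helper
    return [getCell(matrixA, matrixB, i, j) for j in range(len(matrixB[0]))]

if __name__ == '__main__':
    # keep the interactive input under the __main__ guard so that
    # spawned worker processes don't re-prompt for it
    matrixA = getMatrix('A')
    matrixB = getMatrix('B')
    start = timeit.default_timer()
    with Pool() as pool:
        # each worker computes one row of the result
        result = pool.map(partial(computeRow, matrixA, matrixB), range(len(matrixA)))
    stop = timeit.default_timer()
    for row in result:
        print(row)
    print('Time: ', stop - start)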

How to iterate over a Dataframe using a "sliding window" with multiprocessing (used for a genetic algo)

Explanation of what I'm trying to accomplish:
I have a dataframe to iterate over, looking for some condition given a variable.
I have a list of variables and I iterate over the df using multiprocessing; I pop(0) every time a process starts.
Now I need to add one more level, but I can't understand how to do it.
Here is the code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import decimal
import multiprocessing
from multiprocessing import Pool, Manager
import itertools

#dataframe
columns = ['A', 'B', 'C', 'D']
data = np.array([np.random.randint(1, 10_000, size=750)]*4).T
df = pd.DataFrame(data, columns=columns)
print(df)

# Creating a list of tuples to apply a given function
a = np.arange(5, 20, 1)
b = np.arange(1.01, 1.10, 0.01)
d = np.arange(0.95, 0.99, 0.01)
c = list(itertools.product(a, b, d))
list_of_tuples = []
dic = {}
for x in c:
    dic[(x)] = x
for key, value in dic.items():
    uno, due, tre = value[0], value[1], value[2]
    list_of_tuples.append((uno, due, tre))
print(len(dic))  #checking size of dictionary
print(len(list_of_tuples), len(df))  #checking if size match
maximum = max(dic, key=dic.get)  #maximum key inside dictionary
print(maximum, dic[maximum])

new_dic = {}
i = 1
#look_back_period = (len(df) // 10)
#print(look_back_period)
c = 0

"""chunks is the only way where I could use pool.map, it should be a list of list"""
chunks = [list_of_tuples[i::len(list_of_tuples)] for i in range(len(list_of_tuples))]
print(len(chunks[0]))

#this manager is needed to have every process append to the same Dict the result of the
# function that is given below
manager = Manager()
new_dic = manager.dict()

def multi_prova(list_of_tuples):
    list_results = []
    given1, given2, given3 = list_of_tuples.pop(0)
    #sliding_window = df.iloc[0 : c + look_back_period, : ]
    for row in df.itertuples():
        result = (given1 / row.A).round(2)
        list_results.append(result)
        new_dic[str(given1)+', ' + str(given2)+', ' + str(given2)] = result

time1 = time.time()
if __name__ == "__main__":
    try:
        pool = Pool()  # Make the Pool of workers
        results = pool.map(multi_prova, chunks)  # Open the urls in their own threads
        pool.close()  # close the pool and wait for the work to finish
        pool.join()
    except:
        print('error')
time2 = time.time()
print(time2 - time1)

#On my original code len(new_dic) matched len(dic), here is 750 vs 150, don't know why?!?!?!
print(new_dic)
print(len(new_dic))
Shouldn't len(new_dic) equal len(dic)?
There are 750 rows, and a result for every row to 'append' to the dictionary.
So the problems are two:
why len(new_dic) is not 750;
and on top of that, I would like a sliding window that iterates over a slice of the dataframe, collecting a dictionary of lists with the results of every slice of the df while c + look_back_period < len(df).
Hope I was clear enough.
A big hug to anyone who can contribute.
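For the second point, a minimal sketch of the sliding-window slicing described above, based on the commented-out look_back_period in the code (this only shows the windowing, not the multiprocessing part):

look_back_period = len(df) // 10
c = 0
window_results = {}  # hypothetical container for per-window results
while c + look_back_period < len(df):
    sliding_window = df.iloc[c : c + look_back_period, :]
    # evaluate the condition on sliding_window here and store the result;
    # len() is just a placeholder computation
    window_results[c] = len(sliding_window)
    c += 1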

Python parallelised correlation slower than single process correlation

I wanted to parallelize df.corr() using the multiprocessing module in Python. I take one column and compute correlation values against all remaining columns in one process, a second column against the remaining columns in another process, and so on, filling the upper triangle of the correlation matrix by stacking up the result rows from all the processes.
I took sample data of shape (678461, 210), tried my parallelized method and df.corr(), and got running times of 214.40s and 42.64s respectively. So, my parallelized method takes more time.
Is there a way to improve this?
import multiprocessing as mp
import pandas as pd
import numpy as np
from time import *

def _correlation(args):
    i, mat, mask = args
    ac = mat[i]
    arr = []
    for j in range(len(mat)):
        if i > j:
            continue
        bc = mat[j]
        valid = mask[i] & mask[j]
        if valid.sum() < 1:
            c = np.nan
        elif i == j:
            c = 1.
        elif not valid.all():
            c = np.corrcoef(ac[valid], bc[valid])[0, 1]
        else:
            c = np.corrcoef(ac, bc)[0, 1]
        arr.append((j, c))
    return arr
def correlation_multi(df):
    numeric_df = df._get_numeric_data()
    cols = numeric_df.columns
    mat = numeric_df.values
    mat = pd.core.common._ensure_float64(mat).T
    K = len(cols)
    correl = np.empty((K, K), dtype=float)
    mask = np.isfinite(mat)
    pool = mp.Pool(processes=4)
    ret_list = pool.map(_correlation, [(i, mat, mask) for i in range(len(mat))])
    for i, arr in enumerate(ret_list):
        for l in arr:
            j = l[0]
            c = l[1]
            correl[i, j] = c
            correl[j, i] = c
    return pd.DataFrame(correl, index=cols, columns=cols)

if __name__ == '__main__':
    noise = pd.DataFrame(np.random.randint(0, 100, size=(100000, 50)))
    noise2 = pd.DataFrame(np.random.randint(100, 200, size=(100000, 50)))
    df = pd.concat([noise, noise2], axis=1)

    # Single process correlation
    start = time()
    s = df.corr()
    print('Time taken: ', time() - start)

    # Multi process correlation
    start = time()
    s1 = correlation_multi(df)
    print('Time taken: ', time() - start)
The results from _correlation have to be moved from the worker processes to the process running the Pool via interprocess communication.
This means that the return data is pickled, sent to the other process, unpickled and added to the result list.
This takes time and is by nature a sequential process.
And map processes the returns in the order they were sent, IIRC. So if one iteration takes relatively long, other results might be stuck waiting. You could try imap_unordered instead, which yields results as soon as they arrive.
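A minimal sketch of that change, reusing _correlation from the question but carrying the row index along so results can be stored correctly whatever order they arrive in (an illustration, untested against the original data):

def _correlation_indexed(args):
    # wrap the original worker so each result is tagged with its row index
    return args[0], _correlation(args)

pool = mp.Pool(processes=4)
tasks = [(i, mat, mask) for i in range(len(mat))]
for i, arr in pool.imap_unordered(_correlation_indexed, tasks):
    for j, c in arr:
        correl[i, j] = c
        correl[j, i] = c
pool.close()
pool.join()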

financial python library that has xirr and xnpv function?

numpy has irr and npv functions, but I need xirr and xnpv.
This link points out that xirr and xnpv will be coming soon:
http://www.projectdirigible.com/documentation/spreadsheet-functions.html#coming-soon
Is there any python library that has those two functions? Thanks.
Here is one way to implement the two functions.
import scipy.optimize

def xnpv(rate, values, dates):
    '''Equivalent of Excel's XNPV function.

    >>> from datetime import date
    >>> dates = [date(2010, 12, 29), date(2012, 1, 25), date(2012, 3, 8)]
    >>> values = [-10000, 20, 10100]
    >>> xnpv(0.1, values, dates)
    -966.4345...
    '''
    if rate <= -1.0:
        return float('inf')
    d0 = dates[0]  # or min(dates)
    return sum([vi / (1.0 + rate)**((di - d0).days / 365.0) for vi, di in zip(values, dates)])

def xirr(values, dates):
    '''Equivalent of Excel's XIRR function.

    >>> from datetime import date
    >>> dates = [date(2010, 12, 29), date(2012, 1, 25), date(2012, 3, 8)]
    >>> values = [-10000, 20, 10100]
    >>> xirr(values, dates)
    0.0100612...
    '''
    try:
        return scipy.optimize.newton(lambda r: xnpv(r, values, dates), 0.0)
    except RuntimeError:  # Failed to converge?
        return scipy.optimize.brentq(lambda r: xnpv(r, values, dates), -1.0, 1e10)
With the help of various implementations I found on the net, I came up with a python implementation:

def xirr(transactions):
    years = [(ta[0] - transactions[0][0]).days / 365.0 for ta in transactions]
    residual = 1
    step = 0.05
    guess = 0.05
    epsilon = 0.0001
    limit = 10000
    while abs(residual) > epsilon and limit > 0:
        limit -= 1
        residual = 0.0
        for i, ta in enumerate(transactions):
            residual += ta[1] / pow(guess, years[i])
        if abs(residual) > epsilon:
            if residual > 0:
                guess += step
            else:
                guess -= step
                step /= 2.0
    return guess - 1

from datetime import date
tas = [(date(2010, 12, 29), -10000),
       (date(2012, 1, 25), 20),
       (date(2012, 3, 8), 10100)]
print(xirr(tas))  # 0.0100612640381
Created a package for fast XIRR calculation, PyXIRR
It doesn't have external dependencies and works faster than any existing implementation.
from datetime import date
from pyxirr import xirr
dates = [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)]
amounts = [-1000, 1000, 1000]
# feed columnar data
xirr(dates, amounts)
# feed tuples
xirr(zip(dates, amounts))
# feed DataFrame
import pandas as pd
xirr(pd.DataFrame({"dates": dates, "amounts": amounts}))
This answer is an improvement on @uuazed's answer and derives from it. However, there are a few changes:
It uses a pandas dataframe instead of a list of tuples.
It is cashflow-direction agnostic, i.e., whether you treat inflows as negative and outflows as positive or vice versa, the result will be the same, as long as the treatment is consistent for all transactions.
XIRR calculation with this method doesn't work if cashflows are not ordered by date, hence I have handled sorting of the dataframe internally.
In the earlier answer, there was an implicit assumption that XIRR will mostly be positive, which created the problem pointed out in the other comment that XIRR between -100% and -95% cannot be calculated. This solution does away with that problem.
import pandas as pd
import numpy as np

def xirr(df, guess=0.05, date_column='date', amount_column='amount'):
    '''Calculates XIRR from a series of cashflows.
    Needs a dataframe with columns date and amount, customisable through parameters.
    Requires Pandas, NumPy libraries'''
    df = df.sort_values(by=date_column).reset_index(drop=True)
    df['years'] = df[date_column].apply(lambda x: (x - df[date_column][0]).days / 365)
    step = 0.05
    epsilon = 0.0001
    limit = 1000
    residual = 1

    # Test for direction of cashflows
    disc_val_1 = df[[amount_column, 'years']].apply(
        lambda x: x[amount_column] / ((1 + guess)**x['years']), axis=1).sum()
    disc_val_2 = df[[amount_column, 'years']].apply(
        lambda x: x[amount_column] / ((1.05 + guess)**x['years']), axis=1).sum()
    mul = 1 if disc_val_2 < disc_val_1 else -1

    # Calculate XIRR
    for i in range(limit):
        prev_residual = residual
        df['disc_val'] = df[[amount_column, 'years']].apply(
            lambda x: x[amount_column] / ((1 + guess)**x['years']), axis=1)
        residual = df['disc_val'].sum()
        if abs(residual) > epsilon:
            if np.sign(residual) != np.sign(prev_residual):
                step /= 2
            guess = guess + step * np.sign(residual) * mul
        else:
            return guess
Explanation:
In the test block, it checks whether increasing the discounting rate increases the discounted value or reduces it. Based on this test, it is determined in which direction the guess should move. This block makes the function handle cashflows regardless of the direction assumed by the user.
The np.sign(residual) != np.sign(prev_residual) check detects when the guess has increased/decreased beyond the required XIRR rate, because that's when the residual goes from negative to positive or vice versa. The step size is reduced at this point.
The numpy package is not absolutely necessary. Without numpy, np.sign(residual) can be replaced with residual/abs(residual). I have used numpy to make the code more readable and intuitive.
I have tried to test this code with a variety of cash flows. If you find any cases which are not handled by this function, do let me know.
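Hypothetical usage, with the cashflows from the earlier answers (the column names are the function's defaults):

df = pd.DataFrame({
    'date': pd.to_datetime(['2010-12-29', '2012-01-25', '2012-03-08']),
    'amount': [-10000, 20, 10100],
})
print(xirr(df))  # approximately 0.0100612, matching the scipy-based answer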
Edit: Here's a cleaner and faster version of the code using numpy arrays. In my test with about 700 transactions, this code ran 5 times faster than the one above:
def xirr(df, guess=0.05, date_column='date', amount_column='amount'):
    '''Calculates XIRR from a series of cashflows.
    Needs a dataframe with columns date and amount, customisable through parameters.
    Requires Pandas, NumPy libraries'''
    df = df.sort_values(by=date_column).reset_index(drop=True)
    amounts = df[amount_column].values
    dates = df[date_column].values
    years = np.array(dates - dates[0], dtype='timedelta64[D]').astype(int) / 365
    step = 0.05
    epsilon = 0.0001
    limit = 1000
    residual = 1

    # Test for direction of cashflows
    disc_val_1 = np.sum(amounts / ((1 + guess)**years))
    disc_val_2 = np.sum(amounts / ((1.05 + guess)**years))
    mul = 1 if disc_val_2 < disc_val_1 else -1

    # Calculate XIRR
    for i in range(limit):
        prev_residual = residual
        residual = np.sum(amounts / ((1 + guess)**years))
        if abs(residual) > epsilon:
            if np.sign(residual) != np.sign(prev_residual):
                step /= 2
            guess = guess + step * np.sign(residual) * mul
        else:
            return guess
I started from @KT's solution but improved on it in a few ways:
As pointed out by others, there is no need for xnpv to return inf if the discount rate <= -100%.
If the cashflows are all positive or all negative, we can return a nan straight away: no point in letting the algorithm search forever for a solution which doesn't exist.
I have made the daycount convention an input; sometimes it is 365, other times it is 360 - it depends on the case. I have not modelled 30/360. More details are in Matlab's docs.
I have added optional inputs for the maximum number of iterations and for the starting point of the algorithm.
I have not changed the default tolerance of the algorithms, but that's very easy to change.
Key findings for the specific example below (results may well be different for other cases; I have not had the time to test many other cases):
Starting from a value of -sum(all cashflows) / sum(negative cashflows) slows the algorithms a little bit (by 7-10%).
scipy's newton is faster than scipy's fsolve.
Execution time with newton vs fsolve:
import numpy as np
import pandas as pd
import scipy
import scipy.optimize
from datetime import date
import timeit

def xnpv(rate, values, dates, daycount=365):
    daycount = float(daycount)
    # Why would you want to return inf if the rate <= -100%? I removed it, I don't see how it makes sense
    # if rate <= -1.0:
    #     return float('inf')
    d0 = dates[0]  # or min(dates)
    # NB: this xnpv implementation discounts the first value LIKE EXCEL
    # numpy's npv does NOT, it only starts discounting from the 2nd
    return sum([vi / (1.0 + rate)**((di - d0).days / daycount) for vi, di in zip(values, dates)])

def find_guess(cf):
    whereneg = np.where(cf < 0)
    sumneg = np.sum(cf[whereneg])
    return -np.sum(cf) / sumneg

def xirr_fsolve(values, dates, daycount=365, guess=0, maxiters=1000):
    cf = np.array(values)
    if np.where(cf < 0, 1, 0).sum() == 0 or np.where(cf > 0, 1, 0).sum() == 0:
        # if the cashflows are all positive or all negative, no point letting the
        # algorithm search forever for a solution which doesn't exist
        return np.nan
    result = scipy.optimize.fsolve(lambda r: xnpv(r, values, dates, daycount),
                                   x0=guess, maxfev=maxiters, full_output=True)
    if result[2] == 1:  # ie if the solution converged; if it didn't, result[0] will be the last iteration, which won't be a solution
        return result[0][0]
    else:
        # consider raising a warning
        return np.nan

def xirr_newton(values, dates, daycount=365, guess=0, maxiters=1000, a=-100, b=1e5):
    # a and b: lower and upper bound for the brentq algorithm
    cf = np.array(values)
    if np.where(cf < 0, 1, 0).sum() == 0 or np.where(cf > 0, 1, 0).sum() == 0:
        # if the cashflows are all positive or all negative, no point letting the
        # algorithm search forever for a solution which doesn't exist
        return np.nan
    res_newton = scipy.optimize.newton(lambda r: xnpv(r, values, dates, daycount),
                                       x0=guess, maxiter=maxiters, full_output=True)
    if res_newton[1].converged == True:
        out = res_newton[0]
    else:
        res_b = scipy.optimize.brentq(lambda r: xnpv(r, values, dates, daycount),
                                      a=a, b=b, maxiter=maxiters, full_output=True)
        if res_b[1].converged == True:
            out = res_b[0]
        else:
            out = np.nan
    return out

# let's compare how long each takes
d0 = pd.to_datetime(date(2010, 1, 1))
# an investment in which we pay 100 in the first month, then get 2 each month for the next 59 months
df = pd.DataFrame()
df['month'] = np.arange(0, 60)
df['dates'] = df.apply(lambda x: d0 + pd.DateOffset(months=x['month']), axis=1)
df['cf'] = 0
df.iloc[0, 2] = -100
df.iloc[1:, 2] = 2

r = 100
n = 5
t_newton_no_guess = timeit.Timer("xirr_newton(df['cf'], df['dates'], guess = find_guess(df['cf'].to_numpy() ) )", globals=globals()).repeat(repeat=r, number=n)
t_fsolve_no_guess = timeit.Timer("xirr_fsolve(df['cf'], df['dates'], guess = find_guess(df['cf'].to_numpy() ) )", globals=globals()).repeat(repeat=r, number=n)
t_newton_guess_0 = timeit.Timer("xirr_newton(df['cf'], df['dates'], guess = 0.)", globals=globals()).repeat(repeat=r, number=n)
t_fsolve_guess_0 = timeit.Timer("xirr_fsolve(df['cf'], df['dates'], guess = 0.)", globals=globals()).repeat(repeat=r, number=n)

resdf = pd.DataFrame(index=['min time'])
resdf['newton no guess'] = [min(t_newton_no_guess)]
resdf['fsolve no guess'] = [min(t_fsolve_no_guess)]
resdf['newton guess 0'] = [min(t_newton_guess_0)]
resdf['fsolve guess 0'] = [min(t_fsolve_guess_0)]
# the docs explain why we should take the min and not the avg
resdf = resdf.transpose()
resdf['% diff vs fastest'] = (resdf / resdf.min() - 1) * 100
Conclusions
I noticed there were some cases in which newton and brentq didn't converge, but fsolve did, so I modified the function so that, in order, it starts with newton, then brentq, then, lastly, fsolve.
I haven't actually found a case in which brentq was used to find a solution. I'd be curious to understand when it would work, otherwise it's probably best to just remove it.
I went back to try/except because I noticed the code above wasn't identifying all the cases of non-convergence. That's something I'd like to look into when I have a bit more time
This is my final code:
def xirr(values, dates, daycount=365, guess=0, maxiters=10000, a=-100, b=1e10):
    # a and b: lower and upper bound for the brentq algorithm
    cf = np.array(values)
    if np.where(cf < 0, 1, 0).sum() == 0 or np.where(cf > 0, 1, 0).sum() == 0:
        # if the cashflows are all positive or all negative, no point letting the
        # algorithm search forever for a solution which doesn't exist
        return np.nan
    try:
        output = scipy.optimize.newton(lambda r: xnpv(r, values, dates, daycount),
                                       x0=guess, maxiter=maxiters, full_output=True, disp=True)[0]
    except RuntimeError:
        try:
            output = scipy.optimize.brentq(lambda r: xnpv(r, values, dates, daycount),
                                           a=a, b=b, maxiter=maxiters, full_output=True, disp=True)[0]
        except:
            result = scipy.optimize.fsolve(lambda r: xnpv(r, values, dates, daycount),
                                           x0=guess, maxfev=maxiters, full_output=True)
            if result[2] == 1:  # ie if the solution converged; if it didn't, result[0] will be the last iteration, which won't be a solution
                output = result[0][0]
            else:
                output = np.nan
    return output
Tests
These are some tests I have put together with pytest
import pytest
import numpy as np
import pandas as pd
import whatever_the_file_name_was as finc
from datetime import date

def test_xirr():
    dates = [date(2010, 12, 29), date(2012, 1, 25), date(2012, 3, 8)]
    values = [-10000, 20, 10100]
    assert pytest.approx(finc.xirr(values, dates)) == 1.006127e-2

    dates = [date(2010, 1, 1), date(2010, 12, 27)]
    values = [-100, 110]
    assert pytest.approx(finc.xirr(values, dates, daycount=360)) == 0.1
    values = [100, -110]
    assert pytest.approx(finc.xirr(values, dates, daycount=360)) == 0.1
    values = [-100, 90]
    assert pytest.approx(finc.xirr(values, dates, daycount=360)) == -0.1

    # test numpy arrays
    values = np.array([-100, 0, 121])
    dates = [date(2010, 1, 1), date(2011, 1, 1), date(2012, 1, 1)]
    assert pytest.approx(finc.xirr(values, dates, daycount=365)) == 0.1

    # with a pandas df
    df = pd.DataFrame()
    df['values'] = values
    df['dates'] = dates
    assert pytest.approx(finc.xirr(df['values'], df['dates'], daycount=365)) == 0.1

    # with a pandas df and datetime types
    df['dates'] = pd.to_datetime(dates)
    assert pytest.approx(finc.xirr(df['values'], df['dates'], daycount=365)) == 0.1

    # now for some unrealistic values
    df['values'] = [-100, 5000, 0]
    assert pytest.approx(finc.xirr(df['values'], df['dates'], daycount=365)) == 49
    df['values'] = [-1e3, 0, 1]
    rate = finc.xirr(df['values'], df['dates'], daycount=365)
    npv = finc.xnpv(rate, df['values'], df['dates'])
    # this is an extreme case; as long as the corresponding NPV is between these values it's not a bad result
    assertion = (npv < 0.1 and npv > -.1)
    assert assertion == True
P.S. Important difference between this xnpv and numpy.npv
This is not, strictly speaking, relevant to this answer, but useful to know for whoever runs financial calculations with numpy:
numpy.npv doesn't discount the first item of the cashflow - it starts from the second, e.g.
np.npv(0.1, [110, 0]) = 110
and
np.npv(0.1, [0, 110]) = 100
Excel, however, discounts from the very first item:
NPV(0.1, [110, 0]) = 100
Numpy's financial functions will be deprecated and replaced with those of numpy_financial, which however will likely continue to behave the same, if only for backward compatibility.
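A quick check of the difference, assuming numpy_financial is installed (pip install numpy-financial); its npv keeps numpy's convention of treating the first cashflow as occurring at time zero:

import numpy_financial as npf

print(npf.npv(0.1, [110, 0]))        # 110.0 - first value is not discounted
print(npf.npv(0.1, [0, 110]))        # 100.0
print(npf.npv(0.1, [110, 0]) / 1.1)  # 100.0 - Excel-style NPV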
Created a python package finance-calulator which can be used for xirr calculation. underlying, it uses newton's method.
Also I did some time profiling and it is little better than the scipy's xnpv method as suggested in #KT.'s answer.
Here's the implementation.
With Pandas, I got the following to work:
(note, I'm using the ACT/365 convention)

rate = 0.10
dates = pandas.date_range(start=pandas.Timestamp('2015-01-01'), periods=5, freq="AS")
cf = pandas.Series([-500, 200, 200, 200, 200], index=dates)

# intermediate calculations (if interested)
# cf_xnpv_days = [(cf.index[i]-cf.index[i-1]).days for i in range(1,len(cf.index))]
# cf_xnpv_days_cumulative = [(cf.index[i]-cf.index[0]).days for i in range(1,len(cf.index))]
# cf_xnpv_days_disc_factors = [(1+rate)**(float((cf.index[i]-cf.index[0]).days)/365.0)-1 for i in range(1,len(cf.index))]

cf_xnpv_days_pvs = [cf[i] / float(1 + (1 + rate)**(float((cf.index[i] - cf.index[0]).days) / 365.0) - 1) for i in range(1, len(cf.index))]
cf_xnpv = cf[0] + sum(cf_xnpv_days_pvs)
def xirr(cashflows, transactions, guess=0.1):
    # function to calculate the internal rate of return
    # cashflows: list of (date, amount) tuples
    # transactions: list of transaction amounts
    # assumes `from scipy import optimize` and an xnpv(rate, cashflows)
    # that accepts the list of (date, amount) tuples
    try:
        return optimize.newton(lambda r: xnpv(r, cashflows), guess)
    except RuntimeError:
        positives = [x if x > 0 else 0 for x in transactions]
        negatives = [x if x < 0 else 0 for x in transactions]
        return_guess = (sum(positives) + sum(negatives)) / (-sum(negatives))
        return optimize.newton(lambda r: xnpv(r, cashflows), return_guess)
