Efficient summation without looping - python

I need to calculate entries for two long series consisting of sums. Currently I'm looping, which is rather inefficient. I was wondering whether there exists a pandas method to exploit. It follows an MWE.
import numpy as np
import pandas as pd
T = range(1, 8761) # year in hours
T0 = range(6, 8761 + 24, 24) # time instant such that storage is empty
e2p = 2 # energy-to-power; maximal duration [h] such that storage can be emptied
# Input data:
# on: maximal storageinflow
# off: maximal storageoutflow
df = pd.DataFrame(
np.random.randint(0, 100, size=(8760, 2)),
index=T,
columns=['on', 'off']
)
# Output
# sum1: maximal storage level for storage with fixed instants T0
# sum2: maximal storage level for e2p storage
df['sum1'] = np.nan
df['sum2'] = np.nan
# Summation 1
# Everything that flows in within s in [t, t + 23] for t in T0
# must flow out until s = t + 24
for t in T0:
t0 = max(1, t - 23)
t = min(t, 8760)
for s in range(t0, t + 1):
sum_on = df.loc[t0: s, 'on'].sum()
sum_off = df.loc[s + 1: t, 'off'].sum()
df.loc[s, 'sum1'] = min(sum_on, sum_off)
# Summation 2
# Everything that flowed in until t in T, *can* flow out until t+e2p
for t in T:
t0 = max(1, t - e2p + 1)
tf = min(t + e2p, 8760)
sum_on = df.loc[t0: t, 'on'].sum()
sum_off = df.loc[t + 1: tf, 'off'].sum()
df.loc[t, 'sum2'] = min(sum_on, sum_off)
Solution Sum 1
for t in T0:
t0 = max(1, t - VZF + 1)
t = min(t, 8760)
sum_ein = df.loc[t0: t, 'on'].cumsum()
sum_aus = (df.loc[t0: t, 'off']
.iloc[::-1]
.cumsum()
.iloc[::-1]
.shift(periods=-1, fill_value=0))
df.loc[t0: t, 'sum1'] = pd.concat([sum_on, sum_off], axis=1).min(axis=1)
Solution Sum 2
sum_on = df['on'].rolling(window=e2p, min_periods=1).sum()
sum_off = (df['off']
.shift(periods=-1, fill_value=0)
.iloc[::-1]
.rolling(window=e2p, min_periods=1)
.sum()
.iloc[::-1])
df['sum2'] = pd.concat([sum_on, sum_off], axis=1).min(axis=1)

Related

Unable to reproduce Matlab-based results in Python

I've tried to implement a Matlab script by Lindner (2012) in Python. However, the final result D in my Python script diverges from the results which I am able to generate in an online Matlab environment (see pictures below). I ran rand('twister', 1337) in both scripts to make random numbers predictable.
Up until the last step Gram-Schmidt algorithm everything appears to work correctly (the variables' values are the same as far as I can see). However, D is different. Can anyone spot my mistake?
Lindner, Sören, Julien Legault, and Dabo Guan. 2012.
‘Disaggregating Input–Output Models with Incomplete Information’.
Economic Systems Research 24 (4): 329–47.
https://doi.org/10.1080/09535314.2012.689954.
The Matlab script is available via: https://www.tandfonline.com/doi/suppl/10.1080/09535314.2012.689954
Matlab output (first rows and cols) - authoritative:
Diverging Python output (first rows and cols):
"""Implementation of Lindner (2012) in Python with NumPy and Pandas.
Lindner, Sören, Julien Legault, and Dabo Guan. 2012.
‘Disaggregating Input–Output Models with Incomplete Information’.
Economic Systems Research 24 (4): 329–47.
https://doi.org/10.1080/09535314.2012.689954.
The comments in this script contain the Matlab code given in the supplementary
material 'cesr_a_689954_sup_27358897.docx' of Lindner (2012).
Source (accessed 06.12.2022):
https://www.tandfonline.com/doi/suppl/10.1080/09535314.2012.689954
The script contains one aspect of randomness. A random vector is
generated in line 90 of the Matlab script: `base(p,:) = rand(1,Nv)`. For verification purposes, `np.random.seed(1337)` (Python) and `rand('twister', 1337)` (Matlab) was applied.
"""
import numpy as np
import pandas as pd
from tqdm import tqdm
if True:
# Switch flag for verification
# Matlab equivalent: `rand('twister', 1337)`
# Source: https://stackoverflow.com/a/20202330/5696601
np.random.seed(1337)
# %% Loading data
# load('IOT_China.mat'); %Loading China's IO table
flows = pd.read_csv(
# Input–output table of China (2007), in billion RMB
'io-table-cn-2007-flows.csv',
header=None
)
flows_idx = pd.read_csv(
'io-table-cn-2007-flows-idx.csv'
)
flows.columns = pd.MultiIndex.from_frame(flows_idx)
flows.index = pd.MultiIndex.from_frame(flows_idx.iloc[:12, :])
# f = IOT_national(:,end-1); %Vector of final demand
f = flows.loc[:, ('Final demand', 'FD')]
# id = IOT_national(:,end-2); %Vector of intermediate demand
id = flows.loc[:, ('Intermediate demand', 'ID')]
# x = IOT_national(:,end); %Vector of total outputs
x = f + id
# Z = IOT_national(:,1:end-3); %Exchange matrix
Z = flows.loc[
# Rows
:,
# Cols
(~flows.columns.get_level_values('Cat')
.isin(['ID', 'FD', 'TO']))
]
del flows_idx
# temp = size(Z); %Size of IO table
temp = Z.shape
# N = temp(1)-1; %Number of common sectors
N = temp[0] - 1
# A = Z./repmat(transpose(x),N+1,1); %Aggregated technical coefficient matrix
A = np.divide(Z, x)
# x_common = x(1:end-1); %Vector of total outputs for common sectors
x_common = x[:-1]
# f_common = f(1:end-1); %Vector of final demand for common sectors
f_common = f[:-1]
# Note: The last sector of the table is disaggregated,
# i.e. the electricity sector
# x_elec = x(end); %Total output of the disaggregated sector
x_elec = x[-1]
# f_elec = f(end); %Final demand of the disaggregated sector
f_elec = f[-1]
# %% Newly formed sectors from the electricity sector
# n = 3; %Number of new sectors
# w = [0.241;0.648;0.111]; %New sector weights
w = pd.read_csv(
'io-table-cn-2007-w.csv',
header=None
)
w = w.values.flatten()
w_idx = pd.read_csv(
'io-table-cn-2007-w-idx.csv'
)
n = len(w)
# N_tot = N + n; %Total number of sectors for the disaggregated IO table
N_tot = N + n
# x_new = w.*x_elec; %Vector of new total sector outputs
x_new = w*x_elec/1000
# xs = [x_common;x_new]; %Vector of disaggregated economy sector total outputs
xs = np.concatenate((x_common, x_new))
# f_new = w*f_elec; %Final demand of new sectors
f_new = w*f_elec
# %% Building the constraint matrix C
# Nv = n*N_tot + n; %Number of variables
Nv = n * N_tot + n
# Nc = N + n + 1; %Number of constraints
Nc = N + n + 1
# q = [transpose(A(N+1,:));w]; %Vector of constraint constants
q = pd.concat(
[A.iloc[N, :],
pd.Series(w, index=pd.MultiIndex.from_frame(w_idx))]
)
# C = zeros(Nc,Nv); %Matrix of constraints
C = np.zeros((Nc, Nv))
# %% Common sectors constraints
# C11 = zeros(N,N*n);
# for ii = 1:N
# col_indices = n*(ii-1)+1:n*ii;
# C11(ii,col_indices) = ones(1,n);
# end
# C(1:N,1:N*n) = C11;
C11 = np.zeros((N, N*n))
for ii in range(N):
col_indices = range(n*(ii), n*ii+n)
C11[ii, col_indices] = np.ones((1, n))
C[:N, :N*n] = C11
# %% New sectors constraints
# C22 = zeros(1,n^2);
# for ii = 1:n
# col_indices = n*(ii-1)+1:n*ii;
# C22(1,col_indices) = w(ii)*ones(1,n);
# end
# C(N+1,N*n+1:N*n+n^2) = C22;
C22 = np.zeros((1, n**2))
for ii in range(0, n):
col_indices = range(n*(ii), n*ii+n)
C22[0, col_indices] = w[ii]*np.ones((1, n))
C[N, N*n:N*n+n**2] = C22
# %% Final demand constraints
# C31 = zeros(n,N*n);
# for ii = 1:N
# col_indices = n*(ii-1)+1:n*ii;
# C31(1:n,col_indices) = (x_common(ii)/x_elec)*eye(n,n);
# end
# C32 = zeros(n,n^2);
# for ii = 1:n
# col_indices = n*(ii-1)+1:n*ii;
# C32(1:n,col_indices) = w(ii)*eye(n,n);
# end
# C(N+2:end,1:N*n) = C31;
# C(N+2:end,N*n+1:N*n+n^2) = C32;
# C(N+2:end,N*n+n^2+1:end) = eye(n,n);
C31 = np.zeros((n, N*n))
for ii in range(N):
col_indices = range(n*(ii-1)+3, n*ii+3)
C31[:n, col_indices] = (x_common[ii]/x_elec)*np.eye(n)
C32 = np.zeros((n, n**2))
for ii in range(0, n):
col_indices = range(n*(ii-1)+3, n*ii+3)
C32[:n, col_indices] = w[ii]*np.eye(n)
C[N+1:, :N*n] = C31
C[N+1:, N*n:N*n+n**2] = C32
C[N+1:, N*n+n**2:] = np.eye(n)
# %% Building the initial estimate y0
# Technical coefficient matrix of the initial estimate
# As_y0 = zeros(N_tot,N_tot);
# As_y0(1:N,1:N) = A(1:N,1:N); %Common/Common part
# As_y0(1:N,N+1:N_tot) = repmat(A(1:N,N+1),1,n); %Common/New part
# As_y0(N+1:N_tot,1:N) = w*A(N+1,1:N); %New/Common part
# As_y0(N+1:N_tot,N+1:N_tot) = A(N+1,N+1)*repmat(w,1,n); %New/New part
As_y0 = np.zeros((N_tot, N_tot))
As_y0[:N, :N] = A.iloc[:N, :N]
As_y0[:N, N:N_tot] = np.repeat(A.iloc[:N, N].to_numpy(), n).reshape(N, n)
As_y0[N:N_tot, :N] = (
np.multiply(w, A.iloc[N, :N].to_numpy().repeat(n).reshape(N, n)).T
)
As_y0[N:N_tot, N:N_tot] = np.multiply(
A.iloc[N, N],
np.repeat(w, n).reshape(n, n)
)
# %% Generating the orthogonal distinguishing matrix
# %%% Making the constraint matrix orthogonal
# C_orth = C;
# for c = 1:Nc
# for i = 1:c-1
# C_orth(c,:) = C_orth(c,:) - dot(C_orth(c,:),C_orth(i,:))/norm(C_orth(i,:))^2*C_orth(i,:); %Orthogonal projection
# end
# end
C_orth = C.copy()
for c in tqdm(range(Nc), desc='Orthogonalize constraint matrix'):
for i in range(c):
C_orth[c, :] = (
C_orth[c, :]
- np.dot(C_orth[c, :], C_orth[i, :])
/ np.linalg.norm(C_orth[i, :])**2 * C_orth[i, :]
)
# %%% Gram-Schmidt algorithm
# base = zeros(Nv,Nv); %Orthogonal base containing C_orth and D
# base(1:Nc,:) = C_orth;
# for p = Nc+1:Nv
# base(p,:) = rand(1,Nv); %Generate random vector
# for i=1:p-1
# base(p,:) = base(p,:) - dot(base(p,:),base(i,:))/norm(base(i,:))^2*base(i,:); %Orthogonal projection on previous vectors
# end
# base(p,:) = base(p,:)/norm(base(p,:)); %Normalizing
# end
# D = transpose(base(Nc+1:end,:)); %Retrieving the distinguishing matrix from the orthogonal base
base = np.zeros((Nv, Nv))
base[:Nc, :] = C_orth.copy()
for p in tqdm(range(Nc, Nv), desc='Gram-Schmidt algorithm'):
base[p, :] = np.random.rand(1, Nv)
for i in range(p-1):
base[p, :] = (
base[p, :]
- np.dot(base[p, :], base[i, :])
/ np.linalg.norm(base[i, :])**2 * base[i, :]
)
base[p, :] = base[p, :] / np.linalg.norm(base[p, :])
D = base[Nc:, :].T
io-table-cn-2007-flows.csv
687.7,7,0.8,2223.1,0,167.6,0.7,66.4,0,25.9,255,0,3434.2,1420.5,4854.7
2.7,97,5.7,37.1,112,193.5,122.7,22.7,7.1,5.7,25.5,330.2,961.9,41.4,1003.3
0.6,1.3,114.8,11,1189.4,442.2,933.4,29.3,55.7,83.5,17.5,36.8,2915.5,62.3,2977.8
482.2,15.7,25,3813.9,15.8,326.7,98.6,370.1,3.3,171.3,1368.1,27.5,6718.2,4675.6,11393.8
39.4,13.6,89.2,46.2,121.4,463,298.4,83.7,3.4,126.7,771.3,127.5,2183.8,145.5,2329.3
379.8,27.1,122.8,885.2,48,3176.6,250.9,1098.6,7.4,1579,758.9,15.5,8349.8,1189.9,9539.7
14.6,69.3,86.6,136.6,10.3,228.8,2972.3,2684.5,4.7,1208.8,109.4,17.3,7543.2,1085.9,8629.1
58.6,98,197.2,307.8,50.1,339.4,683.5,6359,8.4,531.9,1331.4,295,10260.3,8754.1,19014.4
1.1,1.7,9.2,17.6,4.9,29.8,17.8,17.7,9.5,3,40.1,9.3,161.7,64.9,226.6
1.1,1.3,1.4,2.6,1.2,2.7,2.1,3.5,0.2,59.8,123.1,1,200,6018.7,6218.7
309.7,129.5,189,917.1,130.9,787.8,570.3,1366.1,27.1,942.5,3873.2,278.2,9521.4,10119.7,19641.1
45.8,60.2,174.7,171,48.3,436.4,367.9,214.1,25,82.7,276.1,1129.4,3031.6,241.8,3273.4
io-table-cn-2007-flows-idx.csv
Category,Cat
Agriculture,Ag
Coal minin and processing,CmP
Petroleaum processing and natural gas products,Pp
Food manufacturing and tobacco products,Fm
Petroleaum processing and coking,Ppc
Chemicals,Ch
Metal smelting and pressing,Msp
Machinery and equipment,M+e
Gas production and distribution,Gp+d
Construction,Co
Transport and warehousing,T+w
Electricity production and distribution,Ep+d
Intermediate demand,ID
Final demand,FD
Total output,TO
io-table-cn-2007-w.csv
0.241
0.648
0.111
io-table-cn-2007-w-idx.csv
Category,Cat
Hydro-electricity and others,Hy
Subcritical coal,SubC
Other fossil fuels,OFF
There are some minor issues with your Gram-Schmidt algorithm from above. Note I only checked that as you mentioned:
Up until the last step Gram-Schmidt algorithm everything appears to
work correctly (the variables' values are the same as far as I can
see). However, D is different.
First off, in your outer for loop, you run from Nc -> Nv which means that the random vector in the pth row of your base won't be orthogonalized - the Matlab scripts also runs Nc+1:Nv.
Secondly, (you got it with for-loops): You may run from 0 to p as the projection of the pth vector on the ith vector is the same (no matter if i is between 0 and p-1 or 1 and p-1).
Furthermore, I shortened to code by adding some syntactic sugar (-= and /=) - but besides this, your Gram-Schmidt implementation is the same as proposed in the Lidner 2012 paper.
# Orth. base containing both C_orth and D
base = np.zeros((Nv, Nv))
# C_orth is placed in the first :Nc rows of the base from above (c.f. Matlab code)
base[:Nc, :] = C_orth.copy()
# Generate random vectors for remaining rows
for p in range(Nc+1, Nv):
# Random vector
base[p, :] = np.random.rand(1, Nv)
# Orthogonal projection on previous vectors
for i in range(p):
# Subtract the projection of the pth vector on the ith vector
# from the pth vector - as described in the Paper by:
# base(p,:) = base(p,:)
# - dot(base(p,:),base(i,:))/norm(base(i,:))^2*base(i,:);
# Besides the syntax, it's the exact replication!
base[p, :] -= np.dot(base[p, :], base[i, :]) / np.linalg.norm(base[i, :])**2 * base[i, :]
# Normalize vector
base[p, :] /= np.linalg.norm(base[p, :])
# Retrieve matrix from the orthogonal base
D = base[Nc:, :].T
One thing I'd like to mention as to why your results may also differ: You might be using a different random number generator than in the paper -> You generate different random vectors!

Random portfolio simulation using dirichlet distribution with weight boundaries

I'm currently stuck at a problem which is on random portfolio simulation, however I'm struggling to generate these portfolios that fit into a certain constraints:
the code I have is below:
import numpy as np
from scipy import stats
# n is number of simulation, and width is number of assets
n ,width = 1000000, 38
bound = [0.02, 0.04]
np.random.seed(5) # Set seed
random_weights = np.random.dirichlet(np.ones(width), size = n)
# alphas = np.ones((width,))
# random_weights = np.abs(stats.dirichlet.rvs(alphas, size=n))
# Select only rows that meet weight condition:
cond1 = np.all(bound[0] <= random_weights, axis=1)
cond2 = np.all(random_weights <= bound[1], axis=1)
valid_rows = cond1*cond2
new_weights = random_weights[valid_rows, :]
new_weights end up being empty
I have also tried:
weights = np.random.random((n, width))
weights_sum = weights.sum(axis=1)
weights_sum = np.reshape(weights_sum, (n, 1))
# Standardise these weights so that they sum to 1
random_weights = weights / weights_sum
cond1 = np.all(bound[0] <= random_weights, axis=1)
cond2 = np.all(random_weights <= bound[1], axis=1)
valid_rows = cond1*cond2
new_weights = random_weights[valid_rows, :]
new_weights still ends up being empty
Would you be advise what a possible solution is and why that might be the case?
Ok, looking at Dirichlet and following https://math.stackexchange.com/questions/1439299/sum-of-squares-of-random-variables-that-satisfy-x-1-dotsx-n-1/1440385#1440385.
n = 38
E[Xi] = 1/n
Var[Xi] = (n-1)/(n+1) 1/n2 ~ 1/n2
StdDev[Xi] = sqrt(Var[Xi]) ~ 1/n
You're trying to get ALL 38 random numbers at once with mean 1/38=0.026 and stddev~0.026 to be inside [0.02...0.04] interval.
It would be extremely rare event.

Using map() on a function with multiple inputs to get rid of for loops

Context:
I have a function to upsample multiple arrays that I want to write as efficiently as possible (because I have to run it 370000 times).
This function takes multiple inputs and is composed of 2 for loops. To upsample my arrays, I loop over this function with a parameter k, and I would like to get rid of this loop (which sits outside of the function). I tried using a mix of map() and list-comprehension to minimize my computing time but I can't make it work.
Question:
How to get my map() part of the code working (see last section of code) ? Is there a better way than map() to get rid of for loops ?
Summary:
Function interpolate_and_get_results: 2 for loops. Takes 3D, 2D arrays and int as inputs
This function is ran inside a for loop (parameter k) that I want to get rid of
I wrote some example code, the part with map() does not work because I can't think of a way to pass the k parameter as a list, but also an input.
Thank you !
ps: code to parallelize the interpolation function that I do not use for this example
import numpy as np
import time
#%% --- SETUP OF THE PROBLEM --- %%#
temperatures = np.random.rand(10,4,7)*100
precipitation = np.random.rand(10,4,7)
snow = np.random.rand(10,4,7)
# Flatten the arrays to make them iterable with map()
temperatures = temperatures.reshape(10,4*7)
precipitation = precipitation.reshape(10,4*7)
snow = snow.reshape(10,4*7)
# Array of altitudes to "adjust" the temperatures
alt = np.random.rand(4,7)*1000
# Flatten the array
alt = alt.reshape(4*7)
# Weight Matrix
w = np.random.rand(4*7, 1000, 1000)
#%% Function
def interpolate_and_get_results(temp, prec, Eprec, w, i, k):
# Do some calculations
factor1 = ((temperatures[i,k]-272.15) + (-alt[k] * -6/1000))
factor2 = precipitation[i,k]
factor3 = snow[i,k]
# Iterate through every cell of the upsampled arrays
for i in range(w.shape[1]):
for j in range(w.shape[2]):
val = w[k, i, j]
temp[i, j] += factor1 * val
prec[i, j] += factor2 * val
Eprec[i, j] += factor3 * val
#%% --- Function call without loop simplification --- ##%
# Prepare a template array
dummy = np.zeros((w.shape[1], w.shape[2]))
# Initialize the global arrays to be filled
tempYEAR2 = np.zeros((9, dummy.shape[0], dummy.shape[1]))
precYEAR2 = np.zeros((9, dummy.shape[0], dummy.shape[1]))
EprecYEAR2 = np.zeros((9, dummy.shape[0], dummy.shape[1]))
ts = time.time()
for i in range(temperatures.shape[0]):
# Create empty host arrays
temp = dummy.copy()
prec = dummy.copy()
Eprec = dummy.copy()
for k in range(w.shape[0]):
interpolate_and_get_results(temp, prec, Eprec, w, i, k)
print('Time: ', (time.time()-ts))
#%% --- With Map (DOES NOT WORK) --- %%#
del k
dummy = np.zeros((w.shape[1], w.shape[2]))
# Initialize the global arrays to be filled
tempYEAR2 = np.zeros((9, dummy.shape[0], dummy.shape[1]))
precYEAR2 = np.zeros((9, dummy.shape[0], dummy.shape[1]))
EprecYEAR2 = np.zeros((9, dummy.shape[0], dummy.shape[1]))
# Create a list k to be iterated through with the map() function
k = [k for k in range(0, temperatures.shape[1])]
for i in range(temperatures.shape[0]):
# Create empty host arrays
temp = dummy.copy()
prec = dummy.copy()
Eprec = dummy.copy()
# Call the interpolate function with map() iterating through k
map(interpolate_and_get_results(temp, prec, Eprec, w, i, k), k)
Code from #Jérôme Richard using numba added at the request of user #ken (takes 48.81s to run on my pc):
import numpy as np
import multiprocessing as mp
import time
#%% ------ Create data ------ ###
temperatures = np.random.rand(10,4,7)*100
precipitation = np.random.rand(10,4,7)
snow = np.random.rand(10,4,7)
# Array of altitudes to "adjust" the temperatures
alt = np.random.rand(4,7)*1000
#%% ------ IDW Interpolation ------ ###
# We create a weight matrix that we use to upsample our temperatures, precipitations and snow matrices
# This part is not that important, it works well as it is
MX,MY = np.shape(temperatures[0])
N = 300
T = np.zeros([N*MX+1, N*MY+1])
# create NxM inverse distance weight matrices based on Gaussian interpolation
x = np.arange(0,N*MX+1)
y = np.arange(0,N*MY+1)
X,Y = np.meshgrid(x,y)
k = 0
w = np.zeros([MX*MY,N*MX+1,N*MY+1])
for mx in range(MX):
for my in range(MY):
# Gaussian
add_point = np.exp(-((mx*N-X.T)**2+(my*N-Y.T)**2)/N**2)
w[k,:,:] += add_point
k += 1
sum_weights = np.sum(w, axis=0)
for k in range(MX*MY):
w[k,:,:] /= sum_weights
#%% --- Function --- %%#
# Code from Jérôme Richard: https://stackoverflow.com/questions/72399050/parallelize-three-nested-loops/72399494?noredirect=1#comment127919686_72399494
import numba as nb
# get_results + interpolator
#nb.njit('void(float64[:,::1], float64[:,::1], float64[:,::1], float64[:,:,::1], int_, int_, int_, int_)', parallel=True)
def interpolate_and_get_results(temp, prec, Eprec, w, i, k, mx, my):
factor1 = ((temperatures[i,mx,my]-272.15) + (-alt[mx, my] * -6/1000))
factor2 = precipitation[i,mx,my]
factor3 = snow[i,mx,my]
# Filling the
for i in nb.prange(w.shape[1]):
for j in range(w.shape[2]):
val = w[k, i, j]
temp[i, j] += factor1 * val
prec[i, j] += factor2 * val
Eprec[i, j] += factor3 * val
#%% --- Main Loop --- %%#
ts = time.time()
if __name__ == '__main__':
dummy = np.zeros((w.shape[1], w.shape[2]))
# Initialize the permanent arrays to be filled
tempYEAR = np.zeros((9, dummy.shape[0], dummy.shape[1]))
precYEAR = np.zeros((9, dummy.shape[0], dummy.shape[1]))
EprecYEAR = np.zeros((9, dummy.shape[0], dummy.shape[1]))
smbYEAR = np.zeros((9, dummy.shape[0], dummy.shape[1]))
# Initialize semi-permanent array
smb = np.zeros((dummy.shape[0], dummy.shape[1]))
# Loop over the "time" axis
for i in range(0, temperatures.shape[0]):
# Create empty semi-permanent arrays
temp = dummy.copy()
prec = dummy.copy()
Eprec = dummy.copy()
# Loop over the different weights
for k in range(w.shape[0]):
# Loops over the cells of the array to be upsampled
for mx in range(MX):
for my in range(MY):
interpolate_and_get_results(temp, prec, Eprec, w, i, k, mx, my)
# At each timestep, update the semi-permanent array using the results from the interpolate function
smb[np.logical_and(temp <= 0, prec > 0)] += prec[np.logical_and(temp <= 0, prec > 0)]
# Fill the permanent arrays (equivalent of storing the results at the end of every year)
# and reinitialize the semi-permanent array every 5th timestep
if i%5 == 0:
# Permanent
tempYEAR[int(i/5)] = temp
precYEAR[int(i/5)] = prec
EprecYEAR[int(i/5)] = Eprec
smbYEAR[int(i/5)] = smb
# Semi-permanent
smb = np.zeros((dummy.shape[0], dummy.shape[1]))
print("Time spent:", time.time()-ts)
Note: This answer is not about how to use map, it's about "a better way".
You are doing a lot of redundant calculations. Believe it or not, this code outputs the same result.
# No change in the initialization section above.
ts = time.time()
if __name__ == '__main__':
dummy = np.zeros((w.shape[1], w.shape[2]))
# Initialize the permanent arrays to be filled
tempYEAR = np.zeros((9, dummy.shape[0], dummy.shape[1]))
precYEAR = np.zeros((9, dummy.shape[0], dummy.shape[1]))
EprecYEAR = np.zeros((9, dummy.shape[0], dummy.shape[1]))
smbYEAR = np.zeros((9, dummy.shape[0], dummy.shape[1]))
smb = np.zeros((dummy.shape[0], dummy.shape[1]))
temperatures_inter = temperatures - 272.15
w_inter = w.sum(axis=0)
alt_inter = (alt * (-6 / 1000)).sum()
for i in range(0, temperatures_inter.shape[0]):
temp_i = (temperatures_inter[i].sum() - alt_inter) * w_inter
prec_i = precipitation[i].sum() * w_inter
Eprec_i = snow[i].sum() * w_inter
condition = np.logical_and(temp_i <= 0, prec_i > 0)
smb[condition] += prec_i[condition]
if i % 5 == 0:
tempYEAR[i // 5] = temp_i
precYEAR[i // 5] = prec_i
EprecYEAR[i // 5] = Eprec_i
smbYEAR[i // 5] = smb
smb = np.zeros((dummy.shape[0], dummy.shape[1]))
print("Time spent:", time.time() - ts)
I verified the results by comparing them to the output of the code that uses numba. The difference is about 0.0000001, which is probably caused by rounding error.
print((tempYEAR_from_yours - tempYEAR_from_mine).sum()) # -8.429287845501676e-08
print((precYEAR_from_yours - precYEAR_from_mine).sum()) # 2.595697878859937e-09
print((EprecYEAR_from_yours - EprecYEAR_from_mine).sum()) # -7.430216442116944e-09
print((smbYEAR_from_yours - smbYEAR_from_mine).sum()) # -6.875431779462815e-09
On my PC, this code took 0.36 seconds. It does not use numba and is not even parallelized. It just eliminated redundancy.

efficient frontier/stock analyze

Consider the following task. Using a 10-year period I should calculate the portfolio weights in January and then use these weights in February to calculate the portfolio return and standard deviation. The program should then continue to calculate the weights In February and then use these weights in February to calculate the portfolio returns and standard deviation in marts. This should be done through all the 131 months in the data meaning I should only calculate the weights in the first month of the dataset.
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
p_ret = [] # Define an empty array for portfolio returns
p_vol = [] # Define an empty array for portfolio volatility
tickers = ['AAPL', 'AMZN', 'XOM']
start_date = datetime.date(2010, 1, 2)
end_date = datetime.date(2020, 12, 31)
daily_data = yf.download(tickers, start=start_date, end=end_date) # definere datasættet
daily_data = daily_data['Adj Close'].dropna()
Vector_of_ones = np.array([1,1,1])
frames = [v for _, v in daily_data.groupby(pd.Grouper(freq='M'))]
rf = 0.01 # risk free asset
weights = []
df = pd.DataFrame(columns=tickers)
for w in frames:
#corr_matrix = w.pct_change().apply(lambda x: np.log(1 + x)).corr()
mu = (w.resample('D').last().pct_change().sum())
individual_asset_return = np.subtract(np.transpose(mu), np.dot(Vector_of_ones,rf))
# individual_asset_return = daily_data.pct_change().mean() # finder gennemsnittet
df.loc[+1] = [individual_asset_return[tickers[0]], individual_asset_return[tickers[1]],
individual_asset_return[tickers[2]]]
df.index = df.index - 1
df = df.sort_index()
for d in range(len(df)):
cov_matrix = w.pct_change().apply(lambda x: np.log(1 + x)).cov()
liste = df.iloc[d].tolist()
a = np.dot(np.linalg.inv(cov_matrix), np.transpose(np.array(liste)))
omega_weights = a / (np.dot(np.transpose(Vector_of_ones), a)) # expression to find weights
weights.append(omega_weights)
for afkast in frames[1:]: #loop to find the portfolio returns and standard deviation
cov_matrix1 = afkast.pct_change().apply(lambda x: np.log(1 + x)).cov()
#corr_matrix1 = afkast.pct_change().apply(lambda x: np.log(1 + x)).corr()
df1 = df.iloc[1:, :]
for d1 in range(len(df)):
liste1 = df.iloc[d1].tolist()
portfolio_return = np.dot(np.transpose(omega_weights),
mu)
p_ret.append(portfolio_return)
volatility_portfolio = np.sqrt(np.dot(np.transpose(omega_weights), np.dot(cov_matrix1, omega_weights)))
p_vol.append(volatility_portfolio)
data = {'Returns': p_ret, 'Volatility': p_vol}
for counter, symbol in enumerate(afkast.columns.tolist()):
# print(counter, symbol)
data[symbol + ' weight'] = [w[counter] for w in weights]
portfolios = pd.DataFrame(data) # laver dataframe som sortere sådan at den med mindst volatility er øverst
portfolios['Date'] = pd.date_range(start=start_date, periods=len(portfolios), freq='M')
portfolios.plot(x='Date', y='Returns', kind='line')
# portfolios.plot(x = 'Date', y = 'Volatility', kind = 'line')
plt.show()
print(portfolios.head())
As you probably can see I’m not an advanced coder but I hope I could some help where my code is wrong if there is anything wrong.
I really appreciate any help you can provide.

Python Pandas code takes over a minute to process the data

I have two issues with the below code:
(a) It takes over a minutes to process a record
(b) I receive a Warning [A value is trying to be set on a copy of a slice from a DataFrame. See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy]
for stock in stocklist:
...
...
df['Up Move'] = np.nan
df['Down Move'] = np.nan
df['Average Up'] = np.nan
df['Average Down'] = np.nan
df['RS'] = np.nan
df['RSI'] = np.nan
for x in range(1, len(df)):
df['Up Move'][x] = 0
df['Down Move'][x] = 0
if df['Close'][x] > df['Close'][x-1]:
df['Up Move'][x] = df['Close'][x] - df['Close'][x-1]
if df['Close'][x] < df['Close'][x-1]:
df['Down Move'][x] = abs(df['Close'][x] - df['Close'][x-1])
#Calculate initial Average Up & Down, RS and RSI
df['Average Up'][14] = df['Up Move'][1:15].mean()
df['Average Down'][14] = df['Down Move'][1:15].mean()
df['RS'][14] = df['Average Up'][14] / df['Average Down'][14]
df['RSI'][14] = 100 - (100/(1+df['RS'][14]))
#Calculate rest of Average Up, Average Down, RS, RSI
for x in range(15, len(df)):
df['Average Up'][x] = (df['Average Up'][x-1]*13+df['Up Move'][x])/14
df['Average Down'][x] = (df['Average Down'][x-1]*13+df['Down Move'][x])/14
df['RS'][x] = df['Average Up'][x] / df['Average Down'][x]
df['RSI'][x] = 100 - (100/(1+df['RS'][x]))
df.drop(['Up Move', 'Down Move', 'Average Up', 'Average Down'], axis = 1, inplace = True)
Try to avoid using for loop to handle dataframe.
the code:
#Calculate rest of Average Up, Average Down, RS, RSI
for x in range(15, len(df)):
df['Average Up'][x] = (df['Average Up'][x-1]*13+df['Up Move'][x])/14
df['Average Down'][x] = (df['Average Down'][x-1]*13+df['Down Move'][x])/14
df['RS'][x] = df['Average Up'][x] / df['Average Down'][x]
df['RSI'][x] = 100 - (100/(1+df['RS'][x]))
may equal to:
df.loc[15:, 'Average Up'] = df.loc[15:, 'Average Up'].shift(-1)*13 + df.loc[15:, 'Up Move'][x]/14
df.loc[15:, 'Average Down'] = df.loc[15:, 'Average Down'].shift(-1)*13 + df.loc[15:, 'Up Down'][x]/14
df.loc[15:, 'RS'] = df.loc[15:, 'Average Up']/ df.loc[15:, 'Average Down']
df.loc[15:, 'RSI'] = 100 - (100/(1+df.loc[15:, 'RS'][x]))
Just found the code that gives me the same right result in a couple of seconds. The earlier code would take about one minute to three minutes.
# Compute RSI
def computeRSI (data, time_window):
diff = data.diff(1).dropna() # diff in one field(one day)
#this preservers dimensions off diff values
up_chg = 0 * diff
down_chg = 0 * diff
# up change is equal to the positive difference, otherwise equal to zero
up_chg[diff > 0] = diff[ diff>0 ]
# down change is equal to negative deifference, otherwise equal to zero
down_chg[diff < 0] = diff[ diff < 0 ]
# check pandas documentation for ewm
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html
# values are related to exponential decay
# we set com=time_window-1 so we get decay alpha=1/time_window
up_chg_avg = up_chg.ewm(com=time_window-1 , min_periods=time_window).mean()
down_chg_avg = down_chg.ewm(com=time_window-1 , min_periods=time_window).mean()
rs = abs(up_chg_avg/down_chg_avg)
rsi = 100 - 100/(1+rs)
return rsi
# Calculate the RSI values using our function and add them to the dataframe.
df['RSI'] = computeRSI(df['Adj Close'], 14)
print(df.head())
print(df.tail())

Categories

Resources