Is there any numpy autocorrelation function with standardized output? - python

I followed the advice of defining the autocorrelation function in another post:
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    maxcorr = np.argmax(result)
    # print 'maximum = ', result[maxcorr]
    result = result / result[maxcorr]  # <=== normalization
    return result[result.size/2:]
However, the maximum value was not 1.0, therefore I introduced the line tagged with "<=== normalization".
I tried the function with the dataset of "Time Series Analysis" (Box and Jenkins), chapter 2. I expected to get a result like fig. 2.7 in that book, but the plot I got looks quite different (not reproduced here).
Does anybody have an explanation for this strange, unexpected behaviour of the autocorrelation?
Addition (2012-09-07):
I got into Python programming and did the following:
from ClimateUtilities import *
import phys
#
# The above imports are from R.T. Pierrehumbert's book "Principles of Planetary
# Climate" and the homepage of that book at Cambridge University Press ... they
# mostly define the class "Curve()" used in the section below, which is not
# necessary in order to solve my numpy problem ... :)
#
import numpy as np
import scipy.spatial.distance
# functions to be defined ... :
#
#
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    maxcorr = np.argmax(result)
    # print 'maximum = ', result[maxcorr]
    result = result / result[maxcorr]
    #
    return result[result.size/2:]
##
# second try ... "Box and Jenkins" chapter 2.1 Autocorrelation Properties
# of stationary models
##
# from table 2.1 I get:
s1 = np.array([47,64,23,71,38,64,55,41,59,48,71,35,57,40,58,44,\
80,55,37,74,51,57,50,60,45,57,50,45,25,59,50,71,56,74,50,58,45,\
54,36,54,48,55,45,57,50,62,44,64,43,52,38,59,\
55,41,53,49,34,35,54,45,68,38,50,\
60,39,59,40,57,54,23],dtype=float);
# alternatively in order to test:
s2 = np.array([47,64,23,71,38,64,55,41,59,48,71])
##################################################################################
# according to BJ, ch.2
##################################################################################
print '*************************************************'
global s1short, meanshort, stdShort, s1dev, s1shX, s1shXk
s1short = s1
#s1short = s2 # for testing take s2
meanshort = s1short.mean()
stdShort = s1short.std()
s1dev = s1short - meanshort
#print 's1short = \n', s1short, '\nmeanshort = ', meanshort, '\ns1deviation = \n',\
# s1dev, \
# '\nstdShort = ', stdShort
s1sh_len = s1short.size
s1shX = np.arange(1,s1sh_len + 1)
#print 'Len = ', s1sh_len, '\nx-value = ', s1shX
##########################################################
# c0 to be computed ...
##########################################################
sumY = 0
kk = 1
for ii in s1shX:
    # print 'ii-1 = ', ii-1,
    if ii > s1sh_len:
        break
    sumY += s1dev[ii-1]*s1dev[ii-1]
    # print 'sumY = ', sumY, 's1dev**2 = ', s1dev[ii-1]*s1dev[ii-1]
c0 = sumY / s1sh_len
print 'c0 = ', c0
##########################################################
# now compute autocorrelation
##########################################################
auCorr = []
s1shXk = s1shX
lenS1 = s1sh_len
nn = 1 # factor by which lenS1 should be divided in order
# to reduce computation length ... 1, 2, 3, 4
# should not exceed 4
#print 's1shX = ',s1shX
for kk in s1shXk:
    sumY = 0
    for ii in s1shX:
        # print 'ii-1 = ', ii-1, ' kk = ', kk, 'kk+ii-1 = ', kk+ii-1
        if ii >= s1sh_len or ii + kk - 1 >= s1sh_len/nn:
            break
        sumY += s1dev[ii-1]*s1dev[ii+kk-1]
        # print sumY, s1dev[ii-1], '*', s1dev[ii+kk-1]
    auCorrElement = sumY / s1sh_len
    auCorrElement = auCorrElement / c0
    # print 'sum = ', sumY, ' element = ', auCorrElement
    auCorr.append(auCorrElement)
    # print '', auCorr
    #
    # manipulate s1shX
    #
    s1shX = s1shXk[:lenS1-kk]
    # print 's1shX = ', s1shX
#print 'AutoCorr = \n', auCorr
#########################################################
#
# first 15 of above Values are consistent with
# Box-Jenkins "Time Series Analysis", p.34 Table 2.2
#
#########################################################
s1sh_sdt = s1dev.std() # standard deviation of the short series
#print '\ns1sh_std = ', s1sh_sdt
print '#########################################'
# "Curve()" is a class from RTP ClimateUtilities.py
c2 = Curve()
s1shXfloat = np.ndarray(shape=(1,lenS1), dtype=float)
s1shXfloat = s1shXk # to make floating point from integer; might not be necessary
#print 'test plotting ... ', s1shXk, s1shXfloat
c2.addCurve(s1shXfloat)
c2.addCurve(auCorr, '', 'Autocorr')
c2.PlotTitle = 'Autokorrelation'
w2 = plot(c2)
##########################################################
#
# now try function "autocorr(arr)" and plot it
#
##########################################################
auCorr = autocorr(s1short)
c3 = Curve()
c3.addCurve( s1shXfloat )
c3.addCurve( auCorr, '', 'Autocorr' )
c3.PlotTitle = 'Autocorr with "autocorr"'
w3 = plot(c3)
#
# well, that should be it!
#

The problem with your initial attempt is that you did not subtract the mean from your signal. The following code should work:
timeseries = (your data here)
timeseries = timeseries - np.mean(timeseries)   # subtract the mean first
autocorr_f = np.correlate(timeseries, timeseries, mode='full')
temp = autocorr_f[autocorr_f.size/2:] / autocorr_f[autocorr_f.size/2]
# iact is assumed to be a list collecting the integrated autocorrelation times
iact.append(sum(temp))
In my example temp is the variable you are interested in; it is the forward (normalized) autocorrelation function. If you want the integrated autocorrelation time, you are interested in iact.
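Putting the two steps together (subtract the mean, then normalize by the lag-0 value), a minimal self-contained sketch could look like this; the index arithmetic is written with // so it also behaves the same under Python 3:
import numpy as np

def autocorr_normalized(x):
    """Normalized autocorrelation: mean removed first, 1.0 at lag 0."""
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    result = np.correlate(x, x, mode='full')
    result = result / result[result.size // 2]   # the lag-0 value sits in the middle
    return result[result.size // 2:]             # keep non-negative lags only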

I'm not sure what the issue is.
The autocorrelation of a vector x has to be 1 at lag 0 since that is just the squared L2 norm divided by itself, i.e., dot(x, x) / dot(x, x) == 1.
In general, for any lags i, j in Z, where i != j the unit-scaled autocorrelation is dot(shift(x, i), shift(x, j)) / dot(x, x) where shift(y, n) is a function that shifts the vector y by n time points and Z is the set of integers since we're talking about the implementation (in theory the lags can be in the set of real numbers).
I get 1.0 as the max with the following code (start on the command line as $ ipython --pylab), as expected:
In [1]: n = 1000
In [2]: x = randn(n)
In [3]: xc = correlate(x, x, mode='full')
In [4]: xc /= xc[xc.argmax()]
In [5]: xchalf = xc[xc.size / 2:]
In [6]: xchalf_max = xchalf.max()
In [7]: print xchalf_max
1.0
The only time when the lag 0 autocorrelation is not equal to 1 is when x is the zero signal (all zeros).
The answer to your question is: no, there is no NumPy function that automatically performs standardization for you.
Besides, even if it did you would still have to check it against your expected output, and if you're able to say "Yes this performed the standardization correctly", then I would assume that you know how to implement it yourself.
I'm going to suggest that it might be the case that you've implemented their algorithm incorrectly, although I can't be sure since I'm not familiar with it.
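If the goal is specifically the Box and Jenkins sample autocorrelations r_k = c_k / c_0 from Table 2.2, a short hand-rolled sketch of that estimator (not a NumPy built-in) would be:
import numpy as np

def sample_acf(x, nlags):
    """Box-Jenkins style sample autocorrelations r_k = c_k / c_0 for k = 0..nlags."""
    x = np.asarray(x, dtype=float)
    dev = x - x.mean()
    n = dev.size
    c0 = np.sum(dev * dev) / n
    return np.array([np.sum(dev[:n - k] * dev[k:]) / n / c0 for k in range(nlags + 1)])
For the series s1 from the question, sample_acf(s1, 15) should agree with the values computed by the explicit loops above.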

Related

Unable to reproduce Matlab-based results in Python

I've tried to implement a Matlab script by Lindner (2012) in Python. However, the final result D in my Python script diverges from the results which I am able to generate in an online Matlab environment (see pictures below). I ran rand('twister', 1337) in both scripts to make random numbers predictable.
Up until the last step (the Gram-Schmidt algorithm) everything appears to work correctly (the variables' values are the same as far as I can see). However, D is different. Can anyone spot my mistake?
Lindner, Sören, Julien Legault, and Dabo Guan. 2012.
‘Disaggregating Input–Output Models with Incomplete Information’.
Economic Systems Research 24 (4): 329–47.
https://doi.org/10.1080/09535314.2012.689954.
The Matlab script is available via: https://www.tandfonline.com/doi/suppl/10.1080/09535314.2012.689954
The authoritative Matlab output and the diverging Python output (first rows and cols of D in each case) were attached as screenshots (not reproduced here).
"""Implementation of Lindner (2012) in Python with NumPy and Pandas.
Lindner, Sören, Julien Legault, and Dabo Guan. 2012.
‘Disaggregating Input–Output Models with Incomplete Information’.
Economic Systems Research 24 (4): 329–47.
https://doi.org/10.1080/09535314.2012.689954.
The comments in this script contain the Matlab code given in the supplementary
material 'cesr_a_689954_sup_27358897.docx' of Lindner (2012).
Source (accessed 06.12.2022):
https://www.tandfonline.com/doi/suppl/10.1080/09535314.2012.689954
The script contains one aspect of randomness. A random vector is
generated in line 90 of the Matlab script: `base(p,:) = rand(1,Nv)`. For verification purposes, `np.random.seed(1337)` (Python) and `rand('twister', 1337)` (Matlab) was applied.
"""
import numpy as np
import pandas as pd
from tqdm import tqdm
if True:
    # Switch flag for verification
    # Matlab equivalent: `rand('twister', 1337)`
    # Source: https://stackoverflow.com/a/20202330/5696601
    np.random.seed(1337)
# %% Loading data
# load('IOT_China.mat'); %Loading China's IO table
flows = pd.read_csv(
# Input–output table of China (2007), in billion RMB
'io-table-cn-2007-flows.csv',
header=None
)
flows_idx = pd.read_csv(
'io-table-cn-2007-flows-idx.csv'
)
flows.columns = pd.MultiIndex.from_frame(flows_idx)
flows.index = pd.MultiIndex.from_frame(flows_idx.iloc[:12, :])
# f = IOT_national(:,end-1); %Vector of final demand
f = flows.loc[:, ('Final demand', 'FD')]
# id = IOT_national(:,end-2); %Vector of intermediate demand
id = flows.loc[:, ('Intermediate demand', 'ID')]
# x = IOT_national(:,end); %Vector of total outputs
x = f + id
# Z = IOT_national(:,1:end-3); %Exchange matrix
Z = flows.loc[
# Rows
:,
# Cols
(~flows.columns.get_level_values('Cat')
.isin(['ID', 'FD', 'TO']))
]
del flows_idx
# temp = size(Z); %Size of IO table
temp = Z.shape
# N = temp(1)-1; %Number of common sectors
N = temp[0] - 1
# A = Z./repmat(transpose(x),N+1,1); %Aggregated technical coefficient matrix
A = np.divide(Z, x)
# x_common = x(1:end-1); %Vector of total outputs for common sectors
x_common = x[:-1]
# f_common = f(1:end-1); %Vector of final demand for common sectors
f_common = f[:-1]
# Note: The last sector of the table is disaggregated,
# i.e. the electricity sector
# x_elec = x(end); %Total output of the disaggregated sector
x_elec = x[-1]
# f_elec = f(end); %Final demand of the disaggregated sector
f_elec = f[-1]
# %% Newly formed sectors from the electricity sector
# n = 3; %Number of new sectors
# w = [0.241;0.648;0.111]; %New sector weights
w = pd.read_csv(
'io-table-cn-2007-w.csv',
header=None
)
w = w.values.flatten()
w_idx = pd.read_csv(
'io-table-cn-2007-w-idx.csv'
)
n = len(w)
# N_tot = N + n; %Total number of sectors for the disaggregated IO table
N_tot = N + n
# x_new = w.*x_elec; %Vector of new total sector outputs
x_new = w*x_elec/1000
# xs = [x_common;x_new]; %Vector of disaggregated economy sector total outputs
xs = np.concatenate((x_common, x_new))
# f_new = w*f_elec; %Final demand of new sectors
f_new = w*f_elec
# %% Building the constraint matrix C
# Nv = n*N_tot + n; %Number of variables
Nv = n * N_tot + n
# Nc = N + n + 1; %Number of constraints
Nc = N + n + 1
# q = [transpose(A(N+1,:));w]; %Vector of constraint constants
q = pd.concat(
[A.iloc[N, :],
pd.Series(w, index=pd.MultiIndex.from_frame(w_idx))]
)
# C = zeros(Nc,Nv); %Matrix of constraints
C = np.zeros((Nc, Nv))
# %% Common sectors constraints
# C11 = zeros(N,N*n);
# for ii = 1:N
# col_indices = n*(ii-1)+1:n*ii;
# C11(ii,col_indices) = ones(1,n);
# end
# C(1:N,1:N*n) = C11;
C11 = np.zeros((N, N*n))
for ii in range(N):
    col_indices = range(n*ii, n*ii+n)
    C11[ii, col_indices] = np.ones((1, n))
C[:N, :N*n] = C11
# %% New sectors constraints
# C22 = zeros(1,n^2);
# for ii = 1:n
# col_indices = n*(ii-1)+1:n*ii;
# C22(1,col_indices) = w(ii)*ones(1,n);
# end
# C(N+1,N*n+1:N*n+n^2) = C22;
C22 = np.zeros((1, n**2))
for ii in range(0, n):
    col_indices = range(n*ii, n*ii+n)
    C22[0, col_indices] = w[ii]*np.ones((1, n))
C[N, N*n:N*n+n**2] = C22
# %% Final demand constraints
# C31 = zeros(n,N*n);
# for ii = 1:N
# col_indices = n*(ii-1)+1:n*ii;
# C31(1:n,col_indices) = (x_common(ii)/x_elec)*eye(n,n);
# end
# C32 = zeros(n,n^2);
# for ii = 1:n
# col_indices = n*(ii-1)+1:n*ii;
# C32(1:n,col_indices) = w(ii)*eye(n,n);
# end
# C(N+2:end,1:N*n) = C31;
# C(N+2:end,N*n+1:N*n+n^2) = C32;
# C(N+2:end,N*n+n^2+1:end) = eye(n,n);
C31 = np.zeros((n, N*n))
for ii in range(N):
    col_indices = range(n*(ii-1)+3, n*ii+3)
    C31[:n, col_indices] = (x_common[ii]/x_elec)*np.eye(n)
C32 = np.zeros((n, n**2))
for ii in range(0, n):
    col_indices = range(n*(ii-1)+3, n*ii+3)
    C32[:n, col_indices] = w[ii]*np.eye(n)
C[N+1:, :N*n] = C31
C[N+1:, N*n:N*n+n**2] = C32
C[N+1:, N*n+n**2:] = np.eye(n)
# %% Building the initial estimate y0
# Technical coefficient matrix of the initial estimate
# As_y0 = zeros(N_tot,N_tot);
# As_y0(1:N,1:N) = A(1:N,1:N); %Common/Common part
# As_y0(1:N,N+1:N_tot) = repmat(A(1:N,N+1),1,n); %Common/New part
# As_y0(N+1:N_tot,1:N) = w*A(N+1,1:N); %New/Common part
# As_y0(N+1:N_tot,N+1:N_tot) = A(N+1,N+1)*repmat(w,1,n); %New/New part
As_y0 = np.zeros((N_tot, N_tot))
As_y0[:N, :N] = A.iloc[:N, :N]
As_y0[:N, N:N_tot] = np.repeat(A.iloc[:N, N].to_numpy(), n).reshape(N, n)
As_y0[N:N_tot, :N] = (
np.multiply(w, A.iloc[N, :N].to_numpy().repeat(n).reshape(N, n)).T
)
As_y0[N:N_tot, N:N_tot] = np.multiply(
A.iloc[N, N],
np.repeat(w, n).reshape(n, n)
)
# %% Generating the orthogonal distinguishing matrix
# %%% Making the constraint matrix orthogonal
# C_orth = C;
# for c = 1:Nc
# for i = 1:c-1
# C_orth(c,:) = C_orth(c,:) - dot(C_orth(c,:),C_orth(i,:))/norm(C_orth(i,:))^2*C_orth(i,:); %Orthogonal projection
# end
# end
C_orth = C.copy()
for c in tqdm(range(Nc), desc='Orthogonalize constraint matrix'):
    for i in range(c):
        C_orth[c, :] = (
            C_orth[c, :]
            - np.dot(C_orth[c, :], C_orth[i, :])
            / np.linalg.norm(C_orth[i, :])**2 * C_orth[i, :]
        )
# %%% Gram-Schmidt algorithm
# base = zeros(Nv,Nv); %Orthogonal base containing C_orth and D
# base(1:Nc,:) = C_orth;
# for p = Nc+1:Nv
# base(p,:) = rand(1,Nv); %Generate random vector
# for i=1:p-1
# base(p,:) = base(p,:) - dot(base(p,:),base(i,:))/norm(base(i,:))^2*base(i,:); %Orthogonal projection on previous vectors
# end
# base(p,:) = base(p,:)/norm(base(p,:)); %Normalizing
# end
# D = transpose(base(Nc+1:end,:)); %Retrieving the distinguishing matrix from the orthogonal base
base = np.zeros((Nv, Nv))
base[:Nc, :] = C_orth.copy()
for p in tqdm(range(Nc, Nv), desc='Gram-Schmidt algorithm'):
    base[p, :] = np.random.rand(1, Nv)
    for i in range(p-1):
        base[p, :] = (
            base[p, :]
            - np.dot(base[p, :], base[i, :])
            / np.linalg.norm(base[i, :])**2 * base[i, :]
        )
    base[p, :] = base[p, :] / np.linalg.norm(base[p, :])
D = base[Nc:, :].T
io-table-cn-2007-flows.csv
687.7,7,0.8,2223.1,0,167.6,0.7,66.4,0,25.9,255,0,3434.2,1420.5,4854.7
2.7,97,5.7,37.1,112,193.5,122.7,22.7,7.1,5.7,25.5,330.2,961.9,41.4,1003.3
0.6,1.3,114.8,11,1189.4,442.2,933.4,29.3,55.7,83.5,17.5,36.8,2915.5,62.3,2977.8
482.2,15.7,25,3813.9,15.8,326.7,98.6,370.1,3.3,171.3,1368.1,27.5,6718.2,4675.6,11393.8
39.4,13.6,89.2,46.2,121.4,463,298.4,83.7,3.4,126.7,771.3,127.5,2183.8,145.5,2329.3
379.8,27.1,122.8,885.2,48,3176.6,250.9,1098.6,7.4,1579,758.9,15.5,8349.8,1189.9,9539.7
14.6,69.3,86.6,136.6,10.3,228.8,2972.3,2684.5,4.7,1208.8,109.4,17.3,7543.2,1085.9,8629.1
58.6,98,197.2,307.8,50.1,339.4,683.5,6359,8.4,531.9,1331.4,295,10260.3,8754.1,19014.4
1.1,1.7,9.2,17.6,4.9,29.8,17.8,17.7,9.5,3,40.1,9.3,161.7,64.9,226.6
1.1,1.3,1.4,2.6,1.2,2.7,2.1,3.5,0.2,59.8,123.1,1,200,6018.7,6218.7
309.7,129.5,189,917.1,130.9,787.8,570.3,1366.1,27.1,942.5,3873.2,278.2,9521.4,10119.7,19641.1
45.8,60.2,174.7,171,48.3,436.4,367.9,214.1,25,82.7,276.1,1129.4,3031.6,241.8,3273.4
io-table-cn-2007-flows-idx.csv
Category,Cat
Agriculture,Ag
Coal minin and processing,CmP
Petroleaum processing and natural gas products,Pp
Food manufacturing and tobacco products,Fm
Petroleaum processing and coking,Ppc
Chemicals,Ch
Metal smelting and pressing,Msp
Machinery and equipment,M+e
Gas production and distribution,Gp+d
Construction,Co
Transport and warehousing,T+w
Electricity production and distribution,Ep+d
Intermediate demand,ID
Final demand,FD
Total output,TO
io-table-cn-2007-w.csv
0.241
0.648
0.111
io-table-cn-2007-w-idx.csv
Category,Cat
Hydro-electricity and others,Hy
Subcritical coal,SubC
Other fossil fuels,OFF
There are some minor issues with your Gram-Schmidt algorithm from above. Note that I only checked that part, since as you mentioned:
Up until the last step Gram-Schmidt algorithm everything appears to
work correctly (the variables' values are the same as far as I can
see). However, D is different.
First off, in your outer for loop you run from Nc to Nv, which means that the random vector in the pth row of your base won't be orthogonalized; the Matlab script also runs Nc+1:Nv.
Secondly (you got it with the for loops): you may run the inner loop from 0 to p, since the projection of the pth vector onto the ith vector is the same no matter whether i runs over 0..p-1 or 1..p-1.
Furthermore, I shortened the code by adding some syntactic sugar (-= and /=), but besides this your Gram-Schmidt implementation is the same as the one proposed in the Lindner 2012 paper.
# Orth. base containing both C_orth and D
base = np.zeros((Nv, Nv))
# C_orth is placed in the first :Nc rows of the base from above (c.f. Matlab code)
base[:Nc, :] = C_orth.copy()
# Generate random vectors for remaining rows
for p in range(Nc+1, Nv):
    # Random vector
    base[p, :] = np.random.rand(1, Nv)
    # Orthogonal projection on previous vectors
    for i in range(p):
        # Subtract the projection of the pth vector on the ith vector
        # from the pth vector - as described in the paper by:
        # base(p,:) = base(p,:)
        #     - dot(base(p,:),base(i,:))/norm(base(i,:))^2*base(i,:);
        # Besides the syntax, it's the exact replication!
        base[p, :] -= np.dot(base[p, :], base[i, :]) / np.linalg.norm(base[i, :])**2 * base[i, :]
    # Normalize vector
    base[p, :] /= np.linalg.norm(base[p, :])
# Retrieve the distinguishing matrix from the orthogonal base
D = base[Nc:, :].T
One more thing I'd like to mention as to why your results may also differ: you might be using a different random number generator than in the paper, so you generate different random vectors!
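Since the random draws will differ between Matlab's and NumPy's generators anyway, one RNG-independent sanity check is to verify the properties D must satisfy by construction; a small sketch, assuming C and D are the arrays built above:
import numpy as np

# The columns of D should be orthonormal ...
assert np.allclose(D.T.dot(D), np.eye(D.shape[1]), atol=1e-8)
# ... and orthogonal to every constraint row of C.
assert np.allclose(C.dot(D), 0.0, atol=1e-8)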

Genetic Algorithm ported from matlab to python seems not to evolve

I want to port the working Matlab code of a GA to Python. In Matlab I get to the optimum within a 10% margin (good enough for a quick glance) with a population of 10 and 10k generations. Now I have tried to port this code to Python and get the odd behaviour that the solution seems stuck on a specific (but random) value, sometimes way too far from the optimum.
A call of example1p6A(10000,10,0,0) using the provided matlab code results in
x*=
1.9707
3.0169
f(x*)=
0.9959
with some variance since it uses random numbers but is always close to above result.
When a figure is open it will draw the evolution over the generations, with the mean wiggling around 0.8.
However, when I call the same function in my ported Python code I get varying results like
x* = [1.89979667 3.39332522]
f_max = 0.5499656257996797
all over the place sometimes way off. What makes me wonder is that the plotted min, max, and median seem to be all the same after a few generations in stark contrast to the matlab plots.
I've spent the whole day trying to find my error but had no success.
Things I've been aware of:
- The indexes of vectors run from 0 to N-1 in Python instead of 1 to N in Matlab.
- The one for loop is explicitly floored since I got an error for odd population sizes; Matlab seems to do this implicitly.
- The equivalent of Matlab's size() in Python is shape().
Now I'm out of ideas what might be the issue. I'll provide two plots so you can get a quick look at my results.
function example1p6A(NG,Np,rf,pf)
% example1p6A is a function to generate example 1.6A in
% Power Magnetic Devices: A Multi-Ojective Design Approach
% by S.D. Sudhoff
%
% Call:
% example1p6A(NG,Np,rf,pf)
%
% Input:
% NG = number of generations
% Np = size of population
% rf = report flag (set to 1 to write report)
% pf = plot flag (set to 1 to plot results)
%
% Output:
% All output is textual or graphical
%
% Internal:
% GA = structure of genetic algorithm parameters
% GA.Ng = number of genes
% GA.Np = number in population
% GA.NG = number of generations
% GA.pc = probability of crossover
% GA.pm = probabiligy of gene mutation
% GA.xmn = vector of minimum values for unnormalized genes
% GA.xmx = vector of maximum values for unnormalized genes
% g = generation counter
% P = the population (number of genes by number of individuals)
% PC = copy of the population
% x = uncoded gene values of population
% f = fitness of population
% M = mutated populatin
% fmn = mean fitness of population
% fmx = maximum fitness of population
% fmd = median fitness of population
%
% Version/Date:
% May 7, 2013
%
% Written by:
% S.D. Sudhoff
% Purdue University
% Electrical Engineering Building
% 465 Northwestern Avenue
% West Lafayette, IN 47907-2035
% sudhoff#ecn.purdue.edu
% 765-497-7648
GA.Ng=2; % number of genes
GA.Np=Np; % size of population
GA.NG=NG; % number of generations
GA.pc=0.5; % probability of crossover
GA.alpha=0.5; % blend ratio for crossover
GA.pm=0.1; % probability of a gene being mutated
GA.xmn=[0 0]; % vector of minimum values for unnormalized genes
GA.xmx=[5 5]; % vector of maximum values for unnormalized genes
% initialize the population
P=rand(GA.Ng,GA.Np);
% loop over the generation
for g=1:GA.NG
% make copy of current population
PC=P;
% find the parameter values for each member of the population
x=decode(P,GA);
% find the fitness
f=fitness(x);
if (g<GA.NG)
% find the mating pool
M=select(P,f,GA);
% create the next generation
P=children(M,GA);
end
fmn(g)=mean(f);
fmx(g)=max(f);
fmd(g)=median(f);
% plot
if pf
figure(1)
s=mod(g-1,9)+1;
symbol={'o','x','+','*','s','d','v','p','h'};
plot(x(1,:),x(2,:),symbol{s},'Color',[0.8 0.8 0.8]*(1-g/GA.NG));
hold on;
end
% text report
if rf
disp(['Generation = ' num2str(g)]);
disp('P=');
disp(PC)
disp('x=');
disp(x)
disp('f=');
disp(f)
if (g<GA.NG)
disp('M=');
disp(M);
end
disp(['mean f = ' num2str(fmn(g))]);
disp(['max f = ' num2str(fmx(g))]);
disp(['median f = ' num2str(fmd(g))]);
end
end % generation loop
% finish plots
% if (pf)
hold off
title('Parameter Values');
xlabel('x_1');
ylabel('x_2');
figure(2)
gen=1:GA.NG;
plot(gen,fmn,'r-',gen,fmd,'b',gen,fmx,'g');
legend('Mean','Median','Maximum');
xlabel('Generation');
ylabel('Fitness');
title('Evolution');
% end
% best fitness
[~,i]=max(f);
disp('x*=');
disp(x(:,i))
disp('f(x*)=');
disp(f(i));
end % example1p6A
function x=decode(P,GA)
% decode decodes genes to parameter values
%
% Inputs:
% P = population array
% GA = genetic algorithm parameters
%
% Outputs
% x = parameter array
x=zeros(size(P));
for g=1:GA.Ng
x(g,:)=GA.xmn(g)+(GA.xmx(g)-GA.xmn(g))*P(g,:);
end
end
function f=fitness(x)
% fitness computes the fitness
%
% Inputs:
% x = parameter array
%
% Outputs
% f = fitness
x1=x(1,:);
x2=x(2,:);
f=1./((x1.*x2-6).^2+4*(x2-3).^2+1);
end
function M=select(P,f,GA)
% select determines the mating pool
%
% Inputs:
% P = population array
% f = fitness
% GA = genetic algorithm parameters
%
% Outputs:
% M = mating pool array
M=zeros(size(P));
l1=randi([1 GA.Np],GA.Np,1);
l2=randi([1 GA.Np],GA.Np,1);
for i=1:GA.Np
i1=l1(i);
i2=l2(i);
if (f(i1)>=f(i2))
M(:,i)=P(:,i1);
else
M(:,i)=P(:,i2);
end
end
end
function C=children(M,GA)
% children forms children from the mating pool
%
% Inputs:
% M = mating pool array
% GA = genetic algorithm parameters
%
% Outputs:
% C = array of children
% perform simple blend crossover
C=zeros(size(M));
for i=1:GA.Np/2
i2=2*i;
i1=i2-1;
if GA.pc>rand
mn=0.5*(M(:,i1)+M(:,i2));
df=(M(:,i2)-M(:,i1))*GA.alpha*rand;
C(:,i1)=mn+df;
C(:,i2)=mn-df;
else
C(:,i1)=M(:,i1);
C(:,i2)=M(:,i2);
end
end
% mutation
R=rand(GA.Ng,GA.Np);
index=GA.pm>rand(GA.Ng,GA.Np);
C(index)=R(index);
% gene repair
index=C>1;
C(index)=1;
index=C<0;
C(index)=0;
end
import numpy as np
import math, statistics
import matplotlib.pyplot as plt
def example1p6A(NG, Np, rf, pf):
# example1p6A is a function to generate example 1.6A in
# Power Magnetic Devices: A Multi-Ojective Design Approach
# by S.D. Sudhoff
#
# Call:
# example1p6A(NG,Np,rf,pf)
#
# Input:
# NG = number of generations
# Np = size of population
# rf = report flag (set to 1 to write report)
# pf = plot flag (set to 1 to plot results)
#
# Output:
# All output is textual or graphical
#
# Internal:
# GA = structure of genetic algorithm parameters
# GA.Ng = number of genes
# GA.Np = number in population
# GA.NG = number of generations
# GA.pc = probability of crossover
# GA.pm = probabiligy of gene mutation
# GA.xmn = vector of minimum values for unnormalized genes
# GA.xmx = vector of maximum values for unnormalized genes
# g = generation counter
# P = the population (number of genes by number of individuals)
# PC = copy of the population
# x = uncoded gene values of population
# f = fitness of population
# M = mutated populatin
# fmn = mean fitness of population
# fmx = maximum fitness of population
# fmd = median fitness of population
#
# Version/Date:
# May 7, 2013
#
# Written by:
# S.D. Sudhoff
# Purdue University
# Electrical Engineering Building
# 465 Northwestern Avenue
# West Lafayette, IN 47907-2035
# sudhoff#ecn.purdue.edu
# 765-497-7648
GA = dict(
Np=np.array(Np),
NG=np.array(NG),
Ng=np.array(2),
pc=np.array(0.5),
alpha=np.array(0.5),
pm=np.array(0.1),
xmn=np.array([0, 0]),
xmx=np.array([5, 5]))
# Init population
P = np.random.rand(GA["Ng"],GA["Np"])
# empty bins fol plotting
fmean = []
fmax = []
fmed = []
# loop over the generations
for g in range(GA["NG"]):
PC = P
x = decode(P,GA)
f=fitness(x)
# if g < GA["NG"]: # g ist immer kleiner als GA["NG"] weil range() von 0->(n-1) geht
# M = select(P,f,GA)
# P = children(M,GA)
M = select(P,f,GA)
P = children(M,GA)
fmean.append(statistics.mean(f))
fmax.append(max(f))
fmed.append(statistics.median(f))
print("x* = {}".format(x[:,1]))
print("f_max = {}\n".format(f.max()))
# plot that stuff
plt.style.use("fivethirtyeight")
plt.plot(np.arange(1,GA["NG"]+1),fmean, label = "mean")
plt.plot(np.arange(1,GA["NG"]+1),fmed, label = "median")
plt.plot(np.arange(1,GA["NG"]+1),fmax, label = "max")
plt.xlabel("Generation")
plt.ylabel("Fitness")
plt.title("Evolution")
plt.axis([0,GA["NG"],0,1.2])
plt.tight_layout()
plt.legend()
plt.show()
# return P
def decode(P,GA):
x = np.zeros(np.shape(P))
for g in range(GA["Ng"]):
x[g,:] = GA["xmn"][g]+(GA["xmx"][g]-GA["xmn"][g])*P[g,:]
return x
def fitness(x):
# fitness computes the fitness
#
# Inputs:
# x = parameter array
#
# Outputs
# f = fitness
x1 = x[0,:]
x2 = x[1,:]
f=1/((x1*x2-6)**2+4*(x2-3)**2+1)
return f
def select(P,f,GA):
# select determines the mating pool
#
# Inputs:
# P = population array
# f = fitness
# GA = genetic algorithm parameters
#
# Outputs:
# M = mating pool array
M = np.zeros(np.shape(P))
l1 = np.random.randint(1,GA["Np"]+1,(GA["Np"],1)) #randint is [lower...upper)
l2 = np.random.randint(1,GA["Np"]+1,(GA["Np"],1))
for i in range(GA["Np"]):
i1 = l1[i][0]
i2 = l2[i][0]
if f[i1-1] >= f[i2-1]:
#bei matlab beginnt der index bei 1. python hingegen startet bei 0 und endet bei n-1
M[:,i] = P[:,i1-1]
else:
M[:,i] = P[:,i2-1]
return M
def children(M,GA):
# children forms children from the mating pool
#
# Inputs:
# M = mating pool array
# GA = genetic algorithm parameters
# # Outputs:
# C = array of children
C = np.zeros(np.shape(M))
# perform simple blend crossover
C = np.zeros(np.shape(M))
for i in range(1,math.floor(GA["Np"]/2)+1):
i2 = 2*i
i1 = i2-1
rnd = np.random.rand()
if GA["pc"] > rnd:
mn= 0.5*(M[:,i1-1]+M[:,i2-1]) #the index starts from 0 but the counter from 1
df=(M[:,i2-1]-M[:,i1-1])*GA["alpha"]*np.random.rand()
C[:,i1-1]=mn+df
C[:,i2-1]=mn-df
else:
C[:,i1-1]=M[:,i1-1]
C[:,i2-1]=M[:,i2-1]
# Mutation
R=np.random.rand(GA["Ng"],GA["Np"])
index = GA["pm"] > np.random.rand(GA["Ng"],GA["Np"])
C[index]=C[index]
# Gene repair
index= C>1
C[index] = 1
index =C <0
C[index] = 0
return C
example1p6A(10000,10,0,0)
Although I found one error (C[index] = C[index] where it should have been C[index] = R[index]), I completely ditched my code above and rewrote it from scratch with plenty of print() calls to see what each step is doing. Now I have working code as follows:
import numpy as np
import matplotlib.pyplot as plt
import math, statistics
def mygen(NG, Np, rf, pf):
    GA = dict(
        Ng = np.array(2),
        Np = np.array(Np),
        NG = np.array(NG),
        pc = np.array(0.5),
        alpha = np.array(0.5),
        pm = np.array(0.1),
        xmn = np.array([0, 0]),
        xmx = np.array([5, 5])
    )
    # Init population
    P = np.random.rand(GA['Ng'], GA['Np'])
    # print("Initial population:\n{}".format(P))
    # Empty bins for median and stuff
    fmean = []
    fmed = []
    fmax = []
    index_of_max = 0
    for g in range(GA['NG']):
        # print("Generation {}:".format(g))
        x = decode(P, GA)
        # print("\n\n")
        f = fitness(x)
        # print("fitness f() for each individual:\n{}".format(f))
        index_of_max = np.unravel_index(np.argmax(f, axis=None), f.shape)
        fmean.append(np.mean(f))
        fmed.append(np.median(f))
        fmax.append(f[index_of_max])
        # print("Worst fitness for Individual #{}:{}".format(index_of_min, f[index_of_min]))
        # print("Best fitness for Individual #{}:{}".format(index_of_max, f[index_of_max]))
        # Find the mating pool
        M = select(P, f, GA)
        # Create new population
        P = children(M, GA)
    # Final output
    # index = np.unravel_index(np.argmax(f))
    i = index_of_max
    print("x*:{}".format(x[:, i]))
    print("f(x*)={}".format(f[i]))
    # Final plots
    plt.style.use("fivethirtyeight")
    plt.plot(np.arange(1, GA["NG"]+1), fmean, label="mean")
    plt.plot(np.arange(1, GA["NG"]+1), fmax, label="max")
    plt.plot(np.arange(1, GA["NG"]+1), fmed, label="med")
    plt.xlabel("Generation")
    plt.ylabel("Fitness")
    plt.title("Evolution")
    plt.axis([0, GA["NG"], 0, 1.2])
    # plt.xticks(np.arange(1, GA["NG"]+1, 5))
    plt.tight_layout()
    plt.legend()
    plt.show()

def decode(P, GA):
    x = np.zeros(np.shape(P))
    for g in range(GA['Ng']):
        x[g, :] = GA['xmn'][g] + (GA['xmx'][g] - GA['xmn'][g])*P[g, :]
        # print("decode()\n x{}:\n{}".format(g, x))
    return x

def fitness(x):
    x1 = x[0, :]
    x2 = x[1, :]
    f = 1/((x1*x2 - 6)**2 + 4*(x2 - 3)**2 + 1)
    return f

def select(P, f, GA):
    M = np.zeros(np.shape(P))
    l1 = np.random.randint(0, GA["Np"], (GA["Np"], 1))
    l2 = np.random.randint(0, GA["Np"], (GA["Np"], 1))
    for i in range(GA["Np"]):
        i1 = l1[i, 0]
        i2 = l2[i, 0]
        if f[i1] >= f[i2]:
            M[:, i] = P[:, i1]
            # print(f[i1])
        else:
            M[:, i] = P[:, i2]
            # print(f[i2])
        # print(i1)
    return M

def children(M, GA):
    C = np.zeros(np.shape(M))
    # Simple blend crossover
    for i in range(math.floor(GA["Np"]/2)):
        i2 = 2*(i+1)
        i1 = i2 - 1
        if GA["pc"] > np.random.rand():
            mn = 0.5*(M[:, i1-1] + M[:, i2-1])
            df = (M[:, i2-1] - M[:, i1-1])*GA["alpha"]*np.random.rand()
            C[:, i1-1] = mn + df
            C[:, i2-1] = mn - df
        else:
            C[:, i1-1] = M[:, i1-1]
            C[:, i2-1] = M[:, i2-1]
    # Mutation
    R = np.random.rand(GA["Ng"], GA["Np"])
    index = GA["pm"] > np.random.rand(GA["Ng"], GA["Np"])
    C[index] = R[index]
    # Gene repair
    index = C > 1
    C[index] = 1
    index = C < 0
    C[index] = 0
    return C
mygen(15,50,1,0)
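As an optional refinement (not part of the original port), the tournament selection loop in select can also be written with pure NumPy indexing; a sketch assuming the same GA dict, fitness vector f and population matrix P as above:
import numpy as np

def select_vectorized(P, f, GA):
    # Tournament selection without the explicit Python loop:
    # draw two random opponents per slot and keep the fitter one.
    Np = int(GA["Np"])
    l1 = np.random.randint(0, Np, Np)
    l2 = np.random.randint(0, Np, Np)
    winners = np.where(f[l1] >= f[l2], l1, l2)
    return P[:, winners]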

Why do I get errors in this divergence/convergence (cointegration) testing Python code, and a crumb error?

This is the code:
import numpy as np
import pandas as pd
import statsmodels
from statsmodels.tsa.stattools import coint
# just set the seed for the random number generator
np.random.seed(107)
import matplotlib.pyplot as plt
Generate a fake security X and model its daily returns by drawing from a normal distribution. Then perform a cumulative sum to get the value of X on each day.
# Generate daily returns
Xreturns = np.random.normal(0, 1, 100)
# sum them and shift all the prices up
X = pd.Series(np.cumsum(
Xreturns), name='X') + 50
X.plot(figsize=(15,7))
plt.show()
Generate Y, which has a deep economic link to X, so the price of Y should vary pretty similarly to X.
noise = np.random.normal(0, 1, 100)
Y = X + 5 + noise
Y.name = 'Y'
pd.concat([X, Y], axis=1).plot(figsize=(15,7))
plt.show()
Plot the ratio between the two:
(Y/X).plot(figsize=(15,7))
plt.axhline((Y/X).mean(), color='red', linestyle='--')
plt.xlabel('Time')
plt.legend(['Price Ratio', 'Mean'])
plt.show()
# compute the p-value of the cointegration test
# will inform us as to whether the ratio between the 2 timeseries is stationary
# around its mean
score, pvalue, _ = coint(X,Y)
print (pvalue)
ret1 = np.random.normal(1, 1, 100)
ret2 = np.random.normal(2, 1, 100)
s1 = pd.Series( np.cumsum(ret1), name='X')
s2 = pd.Series( np.cumsum(ret2), name='Y')
pd.concat([s1, s2], axis=1 ).plot(figsize=(15,7))
plt.show()
print 'Correlation: ' + str(X_diverging.corr(Y_diverging))
score, pvalue, _ = coint(X_diverging,Y_diverging)
print 'Cointegration test p-value: ' + str(pvalue)
Error Message:
File "", line 9
print 'Correlation: ' + str(X_diverging.corr(Y_diverging))
SyntaxError: invalid syntax
Y2 = pd.Series(np.random.normal(0, 1, 800), name='Y2') + 20
Y3 = Y2.copy()
Y3[0:100] = 30
Y3[100:200] = 10
Y3[200:300] = 30
Y3[300:400] = 10
Y3[400:500] = 30
Y3[500:600] = 10
Y3[600:700] = 30
Y3[700:800] = 10
Y2.plot(figsize=(15,7))
Y3.plot()
plt.ylim([0, 40])
plt.show()
# correlation is nearly zero
print 'Correlation: ' + str(Y2.corr(Y3))
score, pvalue, _ = coint(Y2,Y3)
print 'Cointegration test p-value: ' + str(pvalue)
Error message:File "", line 14
print 'Correlation: ' + str(Y2.corr(Y3))
SyntaxError: invalid syntax
def find_cointegrated_pairs(data):
n = data.shape[1]
score_matrix = np.zeros((n, n))
pvalue_matrix = np.ones((n, n))
keys = data.keys()
pairs = []
for i in range(n):
for j in range(i+1, n):
S1 = data[keys[i]]
S2 = data[keys[j]]
result = coint(S1, S2)
score = result[0]
pvalue = result[1]
score_matrix[i, j] = score
pvalue_matrix[i, j] = pvalue
if pvalue < 0.02:
pairs.append((keys[i], keys[j]))
return score_matrix, pvalue_matrix, pairs
Run pip install auquan-toolbox and execute the following code snippet:
from backtester.dataSource.yahoo_data_source import YahooStockDataSource
from datetime import datetime
startDateStr = '2007/12/01'
endDateStr = '2017/12/01'
cachedFolderName = 'yahooData/'
dataSetId = 'testPairsTrading'
instrumentIds = ['SPY','AAPL','ADBE','SYMC','EBAY','MSFT','QCOM',
'HPQ','JNPR','AMD','IBM']
ds = YahooStockDataSource(cachedFolderName=cachedFolderName,
dataSetId=dataSetId,
instrumentIds=instrumentIds,
startDateStr=startDateStr,
endDateStr=endDateStr,
event='history')
data = ds.getBookDataByFeature()['Adj Close']
data.head(3)
Error message:File "C:\ProgramData\Anaconda3\lib\site-packages\backtester\dataSource\data_source_utils.py", line 25, in getCookieForYahoo
return cookie, crumb # return a tuple of crumb and cookie
UnboundLocalError: local variable 'crumb' referenced before assignment
Complete article with code can be found here.
Any help is much appreciated. Thank you.
The errors regarding the print function occur because you are missing parentheses: the code was written for Python 2, where print is a statement, but in Python 3 print is a function.
E.g. this error:
Error Message: File "", line 9 print 'Correlation: ' + str(X_diverging.corr(Y_diverging)) SyntaxError: invalid syntax
is caused by the line 9:
print 'Correlation: ' + str(X_diverging.corr(Y_diverging))
which should be:
print('Correlation: ' + str(X_diverging.corr(Y_diverging)))
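The same applies to every other print statement in the snippet. As a sketch of that diverging-series block in Python 3 form (assuming the X_diverging/Y_diverging names were meant to be the s1/s2 series built just above):
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import coint

ret1 = np.random.normal(1, 1, 100)
ret2 = np.random.normal(2, 1, 100)
X_diverging = pd.Series(np.cumsum(ret1), name='X')
Y_diverging = pd.Series(np.cumsum(ret2), name='Y')

print('Correlation: ' + str(X_diverging.corr(Y_diverging)))
score, pvalue, _ = coint(X_diverging, Y_diverging)
print('Cointegration test p-value: ' + str(pvalue))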

How to build multi-dimensional neural network using Python NLTK

As part of a larger project, I'm trying to build a neural network that uses two independent training (output) sets against one input training series. In other words, I have a single input array for which I want to train two independent synapses.
In simplest terms, it looks something like this:
x = [[0,0,0],[0,1,1],[0,1,0], ...]
y = [[0,1],[0,0],[1,0], ...]
z = [[0,0,0,0,1],[1,0,0,0,0],[1,1,0,1,1], ...]
where y and z would be trained as independent synapses against trainset x.
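In NumPy terms, the intended shapes would be something like the following sketch (row counts made up, just to pin the dimensions down):
import numpy as np

x = np.array([[0, 0, 0], [0, 1, 1], [0, 1, 0]])                     # inputs, shape (N, 3)
y = np.array([[0, 1], [0, 0], [1, 0]])                              # first target set, shape (N, 2)
z = np.array([[0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [1, 1, 0, 1, 1]])   # second target set, shape (N, 5)
# Two independent output weight matrices would then be trained:
# one mapping the hidden layer to y's 2 columns, one to z's 5 columns.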
I've been unable to modify my code to get this to work and I know it can't be super complex but I have been stuck for over a day and need some help.
Below is the code that I can't seem to get to work using multiple dimensions.
# Import Necessary Modules for Function
import numpy as np
import time
# Compute a non-linear Sigmoid curve (__--)
def sigmoid(x):
    sigmoidOutput = 1/(1+np.exp(-x))
    return sigmoidOutput

# Convert the output of the sigmoid function to its derivative
def sigmoidDerivative(sigmoidOutput):
    return sigmoidOutput*(1-sigmoidOutput)
def cleanSentence(sentence):
# Tokenize the words within the input sentence
sentenceWords = nltk.word_tokenize(sentence)
# Stem the words within the tokenized setence
sentenceWords = [stemmer.stem(userInput.lower()) for userInput in sentenceWords]
return sentenceWords
# Return a binary bag of words [0 or 1] to evaluate whether or not a word exists within
# a sentence.
def bagWordCheck(sentence, userInput, show_details=False):
# Tokenize the sentence
sentenceWords = cleanSentence(sentence)
# Create a word bag using the training-data from the user-input
wordBag = [0]*len(userInput)
for sWord in sentenceWords:
for i,w in enumerate(userInput):
if w == sWord:
wordBag[i] = 1
if show_details:
print ("found in bag: %s" % w)
return(np.array(wordBag))
# Evaluate the user's input
def think(sentence, showDetails=False):
x = bagWordCheck(sentence.lower(), userInput, showDetails)
if showDetails:
print ("sentence:", sentence, "\n bagWordCheck:", x)
# input layer is our bag of words
l0 = x
# matrix multiplication of input and hidden layer
l1 = sigmoid(np.dot(l0, synapse0))
# output layer
l2 = sigmoid(np.dot(l1, synapse2))
return l2
# ANN and Gradient Descent code from https://iamtrask.github.io//2015/07/27/python-network-part2/
def train(X, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=False, dropout_percent=0.5):
print ("Training with %s neurons, alpha:%s, dropout:%s %s" % (hidden_neurons, str(alpha), dropout, dropout_percent if dropout else '') )
print ("Input matrix: %sx%s Output matrix: %sx%s" % (len(X),len(X[0]),1, len(conversationTypes)) )
np.random.seed(1)
lastMeanError = 1
# randomly initialize our weights with mean 0
synapse0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
synapse2 = 2*np.random.random((hidden_neurons, len(conversations))) - 1
#synapse2 = 2*np.random.random((hidden_neurons, len(conversationTypes))) - 1
#synapse2 = 2*np.random.random((hidden_neurons, len(conversationSubjects))) - 1
prev_synapse0_weight_update = np.zeros_like(synapse0)
prev_synapse2_weight_update = np.zeros_like(synapse2)
synapse0_direction_count = np.zeros_like(synapse0)
synapse2_direction_count = np.zeros_like(synapse2)
for j in iter(range(epochs+1)):
# Feed forward through layers 0, 1, and 2
layer_0 = X
layer_1 = sigmoid(np.dot(layer_0, synapse0))
if(dropout):
layer_1 *= np.random.binomial([np.ones((len(X),hidden_neurons))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))
layer_2 = sigmoid(np.dot(layer_1, synapse2))
# how much did we miss the target value?
layer_2_error = y - layer_2
if (j% 10000) == 0 and j > 5000:
# if this 10k iteration's error is greater than the last iteration, break out
if np.mean(np.abs(layer_2_error)) < lastMeanError:
print ("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))) )
lastMeanError = np.mean(np.abs(layer_2_error))
else:
print ("break:", np.mean(np.abs(layer_2_error)), ">", lastMeanError )
break
# in what direction is the target value?
# were we really sure? if so, don't change too much.
layer_2_delta = layer_2_error * sigmoidDerivative(layer_2)
# how much did each l1 value contribute to the l2 error (according to the weights)?
layer_1_error = layer_2_delta.dot(synapse2.T)
# in what direction is the target l1?
# were we really sure? if so, don't change too much.
layer_1_delta = layer_1_error * sigmoidDerivative(layer_1)
synapse2_weight_update = (layer_1.T.dot(layer_2_delta))
synapse0_weight_update = (layer_0.T.dot(layer_1_delta))
if(j > 0):
synapse0_direction_count += np.abs(((synapse0_weight_update > 0)+0) - ((prev_synapse0_weight_update > 0) + 0))
synapse2_direction_count += np.abs(((synapse2_weight_update > 0)+0) - ((prev_synapse2_weight_update > 0) + 0))
synapse2 += alpha * synapse2_weight_update
synapse0 += alpha * synapse0_weight_update
prev_synapse0_weight_update = synapse0_weight_update
prev_synapse2_weight_update = synapse2_weight_update
now = datetime.datetime.now()
# persist synapses
synapse = {'synapse0': synapse0.tolist(), 'synapse2': synapse2.tolist(),
'datetime': now.strftime("%Y-%m-%d %H:%M"),
'userInput': userInput,
'conversations' : conversations,
'conversationTypes': conversationTypes,
'conversationSubjects' : conversationSubjects
}
synapse_file = "synapses.json"
with open(synapse_file, 'w') as outfile:
json.dump(synapse, outfile, indent=4, sort_keys=True)
print ("saved synapses to:", synapse_file)
X = np.array(trainingSet)
y = np.array(completeConversations)
y1 = np.array(completeTypes)
y2 = np.array(completeSubjects)
start_time = time.time()
train(X, y, hidden_neurons=5, alpha=0.1, epochs=30000, dropout=False, dropout_percent=0.2)
#train(X, y1, hidden_neurons=5, alpha=0.1, epochs=30000, dropout=False, dropout_percent=0.2)
#train(X, y2, hidden_neurons=5, alpha=0.1, epochs=30000, dropout=False, dropout_percent=0.2)
elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")
# probability threshold
ERROR_THRESHOLD = 0.75
# load our calculated synapse values
synapse_file = 'synapses.json'
with open(synapse_file) as data_file:
synapse = json.load(data_file)
synapse0 = np.asarray(synapse['synapse0'])
synapse2 = np.asarray(synapse['synapse2'])
def classify(sentence, showDetails=False):
results = think(sentence, showDetails)
results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD ]
results.sort(key=lambda x: x[1], reverse=True)
return_results =[[conversations[r[0]],r[1]] for r in results]
#return_results =[[conversationSubjects[r[0]],r[1]] for r in results]
#return_results =[[conversationTypes[r[0]],r[1]] for r in results]
return return_results
classify("charlotte explorer is not letting me ")
Please help!

Using numpy vectorize

I'm trying to write some Bayesian probit code using data augmentation. I can get it to work if I loop over the rows of the output matrix, but I'd like to vectorize it and do it all in one shot (presumably that would be faster).
import numpy as np
from numpy import random
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, truncnorm
##################################
### Create some simulated data ###
num_leg = 50
num_bills = 20
a = np.random.uniform(-1,1,num_bills).reshape(num_bills, 1)
b = np.random.uniform(-2,2,num_bills).reshape(num_bills, 1)
x = np.random.standard_normal(num_leg).reshape(num_leg, 1)
ystar_base = a + np.dot(b,x.T)
epsilon = np.random.standard_normal(num_leg * num_bills).reshape(num_bills, num_leg)
ystar = ystar_base + epsilon
y = 1*(ystar >0)
### Initialize some stuff I need ###
avec = [0]*num_bills # These are bill parameters
bvec = [0]*num_bills
betavec = [np.matrix(zip(avec,bvec))]
xvec = [0]*num_leg # these are legislator parameters
_ones = np.ones(num_leg)
def init_y(mat): # initialize a latent y matrix
    if mat == 1: return truncnorm.rvs(0, 10000)
    else: return truncnorm.rvs(-10000, 0)
vectorize_y = np.vectorize(init_y)
latent_y = np.matrix(vectorize_y(y))
burn = 500 # How long to run the MCMC
runs = 500
### define the functions ###
def sample_params(xnow,ynow): # This is the function I'd like to vectorize
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(ynow) == list:
        ynow = np.array(ynow)
    ynow = ynow.T # reshape(ynow.shape[0],1)
    sigma = np.linalg.inv(np.dot(xnow.T, xnow)) ### This is the line that produces an error ###
    xy = np.dot(xnow.T, ynow)
    mu = np.dot(sigma, xy) # this is just (x'x)inv x'y
    return np.random.multivariate_normal(np.array(mu).flatten(), sigma)
vecparams = np.vectorize(sample_params)
def get_mu(xnow, bnow): # getting the updated mean to draw the latent ys
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(bnow) == list:
        bnow = np.array(bnow)
    mu = np.dot(xnow, bnow.T)
    mu = np.matrix(mu)
    return mu
def sample_y(mu, ynow): # generate latent y matrix
    if ynow == 1:
        a, b = (0 - mu), (10000 - mu)
    else:
        a, b = (-10000 - mu), (0 - mu)
    return truncnorm.rvs(a, b)
vector_sample = np.vectorize(sample_y) # I'd like to be able to do something like this
### Here's the MCMC loop with the internal loop over rows(bills)
for i in range(burn+runs):
    this_beta = []
    this_x = []
    this_y = []
    for j in range(num_bills): # I'd like to get rid of this loop
        ex = zip(x_ones, x)
        newbeta = sample_params(ex, latent_y[j])
        this_beta.append(newbeta)
    # ex = np.array(zip(x_ones, x))
    # this_beta = vecparams(ex, latent_y[:,]) # and call the vectorized function here
    betavec.append(this_beta)
    # Note, I can vectorize the latent outputs easily enough here
    mean = get_mu(ex, betavec[-1])
    latent_y = np.matrix(vector_sample(mean, np.matrix(y).T).T.reshape(latent_y.shape[0], latent_y.shape[1]))
### Now a bit of code to check to see if I've recovered what I want ###
test_beta = [zip(*(z)) for z in betavec[burn:]]
test_a = np.array([z[0] for z in test_beta])
test_b = np.array([z[1] for z in test_beta])
amean = test_a.sum(axis = 0)/float(runs)
bmean = test_b.sum(axis = 0)/float(runs)
print 'a mean'
print np.corrcoef([amean, np.array(a)])
print
print 'b mean'
print np.corrcoef([bmean, np.array(b)])
If I comment out the loop and use the commented out lines just above, I get the following error at the line I indicated earlier (the one that defines sigma):
LinAlgError: 0-dimensional array given. Array must be at least two-dimensional
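For context on why that happens: np.vectorize broadcasts its inputs and calls the wrapped function one scalar element at a time (it is essentially a convenience loop), so inside sample_params each xnow arrives as a scalar rather than a row, and np.dot(xnow.T, xnow) becomes 0-dimensional, which np.linalg.inv rejects with exactly this message. A minimal illustration of that behaviour:
import numpy as np

def add(a, b):
    # np.vectorize passes scalars here, one element at a time
    assert np.ndim(a) == 0 and np.ndim(b) == 0
    return a + b

vadd = np.vectorize(add)
print(vadd(np.arange(3), 10))   # [10 11 12]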
