I wrote some code a while ago that processes spectra: it reads data from text files and performs calculations on it. I started with a version that just does everything line by line without any functions, and despite being long, it finishes running in 2.11 seconds (according to %%timeit). Below is that original code, labeled as such.
However, I wanted to put my code into functions instead, for easier readability and for use with different models in the future. Even though I'm performing all the same steps as before (just inside my functions this time), it is so much slower; that code is also below. Now I have to wait about 15-20 minutes to get the same outputs. Why is it so much slower, and is there any way I can make it significantly faster while still using functions?
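One way to see where the extra time goes is to profile a single call with the standard-library profiler. This is a minimal sketch, not part of the original post; dust and its arguments are the names defined in the function-ized code further down, so it would be run after those definitions.
import cProfile

# Print a per-function breakdown of one call, sorted by cumulative time,
# to see which step dominates the 15-20 minute runtime.
cProfile.run(
    "dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)",
    sort="cumulative",
)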
Original Code:
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
filename = 'bpass_spectra.txt'
extinctionfile = 'ExtinctionLawPoints.txt' # from R_V = 4.0
pointslist = []
datalist = []
speclist = []
# Constants
Msun = 1.98892e30 # solar mass [kg]
h = 4.1357e-15 # Planck's constant [eV s]
c = float(3e8) # speed of light [m/s]
# Read spectra file
f = open(filename, 'r')
rawspectra = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawspectra[0])
del rawspectra[0]
for i in range(len(rawspectra)):
newlist = rawspectra[i].split(' ')
datalist.append(newlist)
# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()
for i in range(len(rawpoints)):
newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
pointslist.append(newlst)
pointslist = pointslist[3:]
lambdalist = [float(item[0]) for item in pointslist]
k_abslist = [float(item[4]) for item in pointslist]
xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
# Create new lists
Elist = [float(item[0]) for item in datalist]
speclambdalist = [h*c*1e9/E for E in Elist]
z1list = [float(item[1]) for item in datalist]
speclist.extend(z1list)
met = met[0]
klist = [None]*len(speclist)
Loutlist = [None]*len(speclist)
Tlist = [None]*len(speclist)
# Define parameters
b = 2.0
R = 1.0
z = 1.0
Mgas = 1.0 # mass of gas, input
Mhalo = 2e41 # mass of dark matter halo, known
if float(met) > 0.0052:
DGRlist = [50.0*np.exp(-2.21)*float(met)]*len(speclist)
elif float(met) <= 0.0052:
DGRlist = [((50.0*float(met))**3.15)*np.exp(-0.96)]*len(speclist)
for i in range(len(speclist)):
if float(Elist[i]) <= 4.1357e-3: # frequencies <= 10^12 Hz
klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**b # extinction law [cm^2/g]
elif float(Elist[i]) > 4.1357e-3: # frequencies > 10^12 Hz
klist[i] = k_interp(Elist[i]) # interpolated function's value at Elist[i]
Mdustlist = [Mgas*DGR for DGR in DGRlist] # dust mass
Rhalo = 0.784*(0.27**2.0)*(0.7**(-2.0/3.0))*float(10.0/(1.0+z))*((Mhalo/(1e8*Msun))**(1.0/3.0))
Rdust = 0.018*Rhalo # [kpc]
for i in range(len(speclist)):
Tlist[i] = 3*Mdustlist[i]*klist[i]/(4*np.pi*Rdust)
Linlist = [float(spectra)*R for spectra in speclist]
# Outgoing luminosity as function of wavelength
for i in range(len(Linlist)):
Loutlist[i] = Linlist[i]*np.exp(-Tlist[i])
# Test the calculation
print "LIN ELEMENTS 0 AND 1000:", Linlist[0], Linlist[1000]
print "LOUT ELEMENTS 0 AND 1000:", Loutlist[0], Loutlist[1000]
New "function-ized" Code (much slower):
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
# Required files and lists
filename = 'bpass_spectra.txt' # number of columns = 4
extinctionfile = 'ExtinctionLawPoints.txt' # R_V = 4.0
datalist = []
if filename == 'bpass_spectra.txt':
filetype = 4
else:
filetype = 1
if extinctionfile == 'ExtinctionLawPoints.txt':
R_V = 4.0
else:
R_V = 1.0 #to be determined
# Constants
M_sun = 1.98892e30 # solar mass [kg]
h = 4.1357e-15 # Planck's constant [eV s]
c = float(3e8) # speed of light [m/s]
# Inputs
beta = 2.0
R = 1.0
z = 1.0
M_gas = 1.0
M_halo = 2e41
# Read spectra file
f = open(filename, 'r')
rawlines = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawlines[0])
del rawlines[0]
for i in range(len(rawlines)):
newlist = rawlines[i].split(' ')
datalist.append(newlist)
# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()
def interpolate(R_V, rawpoints, Elist, j):
pointslist = []
if R_V == 4.0:
for i in range(len(rawpoints)):
newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
pointslist.append(newlst)
pointslist = pointslist[3:]
lambdalist = [float(item[0]) for item in pointslist]
k_abslist = [float(item[4]) for item in pointslist]
xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
return k_interp(Elist[j])
# Dust extinction function
def dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met):
speclist = []
if filetype == 4:
metallicity = float(met[0])
Elist = [float(item[0]) for item in datalist]
speclambdalist = [h*c*1e9/E for E in Elist]
met1list = [float(item[1]) for item in datalist]
speclist.extend(met1list)
klist, Tlist = [None]*len(speclist), [None]*len(speclist)
if metallicity > 0.0052:
DGRlist = [50.0*np.exp(-2.21)*metallicity]*len(speclist) # dust to gas ratio
elif metallicity <= 0.0052:
DGRlist = [((50.0*metallicity)**3.15)*np.exp(-0.96)]*len(speclist)
for i in range(len(speclist)):
if Elist[i] <= 4.1357e-3: # frequencies <= 10^12 Hz
klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**beta # extinction law [cm^2/g]
elif Elist[i] > 4.1357e-3: # frequencies > 10^12 Hz
klist[i] = interpolate(R_V, rawpoints, Elist, i) # interpolated function's value at Elist[i]
Mdustlist = [M_gas*DGR for DGR in DGRlist] # dust mass
R_halo = 0.784*(0.27**2.0)*(0.7**(-2.0/3.0))*float(10/(1+z))*((M_halo/(1e8*M_sun))**(1.0/3.0))
R_dust = 0.018*R_halo # [kpc]
# Optical depth calculation
Tlist = [3*Mdustlist[i]*klist[i]/(4*np.pi*R_dust) for i in range(len(speclist))]
# Ingoing and outgoing luminosities as functions of wavelength
Linlist = [float(spectra)*R for spectra in speclist]
Loutlist = [Linlist[i]*np.exp(-Tlist[i]) for i in range(len(speclist))]
return speclambdalist, Linlist, Loutlist
print dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)
Even when I have the function return only Loutlist instead of the tuple of three lists, it's still extremely slow. Any ideas on why this is? Also, I'm going to want to return the tuple and then plot speclambdalist versus Linlist, and speclambdalist versus Loutlist, on the same plot. But I'm under the impression that each time I call dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)[i] with i = 0, 1, or 2 (which I'll be doing multiple times), the whole function has to run again. Is there any way to bypass these extra runs to further increase speed? Thank you!
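On that last point: each call to dust(...) does re-run the whole computation, so indexing the result three separate times triples the work. A minimal sketch of calling it once and unpacking the returned tuple (using the names from the code above), after which the three lists can be reused freely:
# Call once, unpack once; then reuse the lists for plotting or anything else.
speclambdalist, Linlist, Loutlist = dust(interpolate, filetype, datalist,
                                         beta, R, z, M_gas, M_halo, met)

plt.plot(speclambdalist, Linlist, label="L_in")
plt.plot(speclambdalist, Loutlist, label="L_out")
plt.legend()
plt.show()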
I'm trying to recreate the original signal from the FFT of a sampled signal. When I take the inverse FFT, I only get amplitude information (a single column). How can I get the corresponding time coordinates?
This is a screenshot of my original signal, recorded from 0 to 10 s with step 0.001 s. When I take the IFFT, I get the same number of data points as my signal, but I can't find the corresponding time information.
How can I get the correct time information?
I'm including the Python code I used and a plot of the two signals.
#generating signal here
import numpy as np
k = float ( 3.1416*2)
f1 = 100
f2 = 150
f3 = 250
ds = max(f1,f2,f3)
ds = float(4*ds)
dt = 1.000/ds
lf = min (f1,f2,f3)
lT = 1.00/lf
N = 10 # cycles
totaltime = N*lT
data = []
tt = []
mf = 1/dt
print "TotalTime =", totaltime
for t in np.arange(0.0, totaltime,dt/100 ) :
#t = tk/mf
print t
wave1 = np.sin(k*f1*t)
wave2 = np.sin(k*f2*t)
wave3 = np.sin(k*f3*t)
summ = wave1 + wave2 + wave3
print t," ", summ
tt.append(t)
data.append(summ)
print tt
print data
np.savetxt("data.txt",np.c_[tt,data])
#######################
#taking the FFT here
fourier = []
tt =[]
yy=[]
logname = str("data.txt")
with open (logname,"rb") as wdata:
for line in wdata :
if not line.startswith("#") :
sl = line.split()
c11 = float(sl[0])
#c11 = c1*10**(-12)
c2 = float(sl[1])
tt.append(c11)
yy.append(c2)
n = len(yy)
n1 = len(tt)
print "n=",n,"(",n1,")"
#to calculate the time step , find the difference between 2 time-values
t0 = float(tt[0])
print "t0=",t0
t1 = float(tt[1])
print "t1=",t1
ts = t1 - t0
print "ts=", ts
yf = np.fft.fft(yy)
yf_abso = np.abs(yf)
freq = np.fft.fftfreq(n, d=ts)
np.savetxt('fft-data.txt', np.c_[freq, yf_abso])
######################
# taking the inverese FFT
filename = str("fft-data.txt")
FFTdata =[]
FREQdata = []
with open (filename,'r') as fftfile :
for line in fftfile :
if not line.startswith("#") :
split_line = line.split()
fpoint = float(split_line[1])
freqz = float(split_line[0])
FFTdata.append(fpoint)
FREQdata.append(freqz)
ireverse = np.fft.ifft(FFTdata)
reverse = np.abs(ireverse)
print type(reverse)
np.savetxt ("ireverse.txt", ireverse)
np.savetxt("reverse.txt", reverse)
The sample locations for the output of the IFFT are the same as those for the input to the FFT. You are doing that part right.
The output of the IFFT looks shifted, but it is not. What happens is that you threw away the phase information of the frequency spectrum when you saved it. You do
yf_abso = np.abs(yf)
and then save yf_abso. By taking the absolute value, you have thrown away important information. There is a reason that the FFT produces complex values. Throwing away half that information means you cannot reconstruct the original signal any more.
If you save the complex values, and use those in the last part of your code to compute the IFFT, then the real component of the output of the IFFT will match your input signal. The imaginary component there should be close to zero, different just due to numerical precision issues in floating-point computations.
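A minimal sketch of that change, reusing the yy and tt arrays from the code above (np.save/np.load are used here instead of savetxt so the complex values round-trip cleanly; the file name is just an example):
import numpy as np

yf = np.fft.fft(yy)
np.save("fft-data.npy", yf)          # keep the complex spectrum, phase included

yf_loaded = np.load("fft-data.npy")
recovered = np.fft.ifft(yf_loaded)
# recovered.real matches the original samples in yy (up to floating-point error),
# and recovered.imag is numerically ~0. The time axis is simply the original tt,
# so plotting recovered.real against tt reproduces the input signal.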
I'm currently learning about the PLA (perceptron learning algorithm), and I found something weird about my implementation.
First of all, the data is linearly separable, so the algorithm converges every time.
I implemented my PLA by visiting the examples in a fixed, pre-determined random cycle, and repeated the experiment a large number of times.
When I plot the number of updates vs. frequency, I find that some update counts have frequency 0; for example, the algorithm never halts after exactly 34, 36, 38, or 40 updates. Why is this?
Also, why isn't the plot a single-peaked curve, but rather one with three peaks?
Train Data
My PLA plot
Here is my implementation:
import numpy as np
import random as rd
import matplotlib.pyplot as plt
#Read Data
Data_X = []
Data_Y = []
Train_Data = "hw1_7_train"
f = open(Train_Data, "r")
if f.mode == "r":
fl = f.readlines()
for line in fl:
Data = line.split()
Data_X.append([1.0] + [float(i) for i in Data[0:4]])
Data_Y.append(int(Data[4]))
f.close()
Data_X = np.array(Data_X)
Data_Y = np.array(Data_Y)
def GoNext(Pos, Length):
if Pos < Length-1:
Pos += 1
else:
Pos = 0
return Pos
def sign(x):
return -1 if x <= 0 else 1
def PLA(X, Y, cycle):
weight = np.array([0.0] * 5)
Length = len(Y)
Success = 0
LastFail = -1
Current_Id = 0
UpdateCount = 0
while(not Success):
Current_Pos = cycle[Current_Id]
Sign = sign(np.inner(weight, X[Current_Pos]))
#If the current point is correct, check if we can halt, else continue
if Sign == Y[Current_Pos]:
#Went for a full round without mistake with initial weight (NOT likely to happen)
if LastFail == -1 and Current_Id == Length-1:
Success = 1
continue
#Went for a full round without a mistake since the last point we modified
elif Current_Pos == LastFail:
Success = 1
continue
#Otherwise go to the next point
else:
Current_Id = GoNext(Current_Id, Length)
continue
#If the current point is error, modify the weight
else:
LastFail = Current_Pos
#Modify the weight
weight += Y[Current_Pos] * X[Current_Pos]
UpdateCount += 1
Current_Id = GoNext(Current_Id, Length)
continue
return UpdateCount
TotalIter = 9999
TrackUpdateCount = []
for iter in range(TotalIter):
Cycle = list(range(len(Data_Y)))
rd.shuffle(Cycle)
TrackUpdateCount.append(PLA(Data_X, Data_Y, Cycle))
print("Average Update: ", sum(TrackUpdateCount)/TotalIter)
#Plotting...
UpperBound = max(TrackUpdateCount)
LowerBound = min(TrackUpdateCount)
x_axis = list(range(LowerBound, UpperBound+1))
y_axis = [0]*(UpperBound-LowerBound+1)
for i in range(LowerBound, UpperBound+1):
y_axis[i-LowerBound] = TrackUpdateCount.count(i)
plt.bar(x_axis, y_axis)
plt.xlabel("Number of updates")
plt.ylabel("Frequency")
plt.show()
I'm trying to convert this PyMC2 example of an ERGM to PyMC3.
Using the documentation and other examples, I have come up with this code. It runs without throwing errors, but gives the wrong answers (the estimates are ~0). This first part is just setup: it is almost identical to the tutorial and works correctly.
import numpy as np
import pymc3 as pm
import networkx as nx
import csv
import theano.tensor as tt
from theano.compile.ops import as_op
with open("grey_adjacency.tsv") as f:
first_line = f.readline()
adj = np.loadtxt("grey_adjacency.tsv", delimiter="\t",skiprows=1, usecols=list(range(1,45)))
G = nx.from_numpy_matrix(adj)
names = [name.strip() for name in first_line.split("\t")[1:]]
G = nx.relabel_nodes(G, {i: names[i] for i in range(44)})
node_attributes = []
with open("grey_nodes.tsv") as f:
reader = csv.DictReader(f, dialect=csv.excel_tab)
for row in reader:
node_attributes.append(row)
for node in node_attributes:
name = node["name"]
for key, val in node.items():
if key == "name":
continue
G.node[name][key] = val
matrix = nx.to_numpy_matrix(G)
matrix[np.triu_indices_from(matrix)] = 0
def edge_count(G):
size = len(G)
ones = np.ones((size, size))
# Zero out the upper triangle:
if not G.is_directed():
ones[np.triu_indices_from(ones)] = 0
return ones
def node_match(G, attrib):
size = len(G)
attribs = [node[1][attrib] for node in G.nodes(data=True)]
match = np.zeros(shape=(size, size))
for i in range(size):
for j in range(size):
if i != j and attribs[i] == attribs[j]:
match[i,j] = 1
if not G.is_directed():
match[np.triu_indices_from(match)] = 0
return match
# Create the gender-match matrix
gender_match_mat = node_match(G, "sex")
This next part is where I am having trouble converting the old PyMC2 code to use PyMC3:
#as_op(itypes=[tt.dmatrix, tt.dmatrix], otypes=[tt.dmatrix])
def probs(t1, t2):
probs = 1/(1+np.exp(-1*sum([t1, t2])))
probs[np.diag_indices_from(probs)] = 0
probs[np.triu_indices_from(probs)] = 0
return probs
with pm.Model():
density_coef = pm.Normal("density", mu=0, sd=0.001)
gender_match_coef = pm.Normal("gender_match", mu=0, sd=0.001)
density_term = density_coef * edge_count(G)
gender_match_term = gender_match_coef * gender_match_mat
ps = probs(density_term, gender_match_term)
outcome = pm.Bernoulli("outcome", p=ps, observed=matrix)
trace = pm.sample(5000, step=pm.Metropolis(), tune=500, njobs=1)
density_trace = trace["density"]
gender_match_trace = trace["gender_match"]
print("Density: {0:.3f}, {1:.3f}".format(np.mean(density_trace), np.std(density_trace)))
print("Gender: {0:.3f}, {1:.3f}".format(np.mean(gender_match_trace), np.std(gender_match_trace)))
Which gives the incorrect output:
Density: -0.000, 0.001
Gender: -0.000, 0.001
This answer suggests I could get it working by using tensors instead of the original example's numpy functions. I tried this:
#as_op(itypes=[tt.dmatrix, tt.dmatrix], otypes=[tt.dmatrix])
def probs(t1, t2):
probs = 1/(1+tt.exp(-1*tt.sum([t1, t2])))
probs = tt.fill_diagonal(probs, 0)
probs = tt.tril(probs)
return probs
But that gives the error:
TypeError: FillDiagonal: first parameter must have at least two dimensions
I also wonder whether I'm even going about this the right way. The documentation for logit uses GLM instead, but I don't see how to apply that approach to this problem.
I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline
def truncate(f, n):
return math.floor(f * 10 ** n) / 10 ** n
def chooseCenter(data, centers):
length = data.shape
cent = []
while len(cent) < centers :
x = random.randrange(0,length[0])
y = random.randrange(0,length[1])
if data.iloc[x][y] not in cent:
d = truncate(data.iloc[x][y],2)
cent.append(d)
return cent
def distance(val, center):
return math.sqrt((val- center)**2)
def getDistances(centers, data):
length = data.shape
dist = []
for i in range(length[0]):
for j in range(length[1]):
y = []
for k in range(len(centers)):
val = distance(data.iloc[i][j], centers[k])
y.append(truncate(val,3))
dist.append(y)
return dist
def findClosest(data, dist):
close = data.copy()
length = close.shape
indexes = []
for i in range(len(dist)):
pt = min(dist[i])
idx = dist[i].index(pt)
indexes.append(idx)
#print(indexes)
length = data.shape
n = np.array(indexes)
n = pd.DataFrame(np.reshape(n, (length[0],length[1])))
#reshape this data frame into the same shape as the data
#keep running the find closest until there is no change
#try heatmap on this?
#this should cluster it, but to make sure test it
#might need to do some tweaking to this
return n
# for i in range(length[0]):
# for j in range(length[1]):
# print('dist[i]', dist[j])
# pt = min(dist[j])
# print(pt)
# idx = dist[j].index(pt)
# close.iloc[i][j] = int(idx)
#return close
def computeNewCenter(data, close):
d = dict()
for i in range(len(close)):
for j in range(len(close[0])):
d[close.iloc[i][j]] = []
for i in range(len(data)):
for j in range(len(data[0])):
if close.iloc[i][j] in d:
d[close.iloc[i][j]].append(data.iloc[i][j])
newCenters = []
for key, value in d.items():
m = np.mean(value)
newCenters.append(truncate(m, 3))
return newCenters
# lst = [[] * numcenters]
# for i in range(len(close)):
# for j in range(len(close[0])):
# if close.iloc[i][j]
def main():
data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
data = data.T
#print(data)
df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
df = df.iloc[::-1]
# print(df)
# print(df.iloc[1][9])
# print(df)
# print(df.iloc[0][1])
# heatmap = plt.pcolor(df, cmap=plt.cm.bwr)
# plt.colorbar(heatmap)
c = chooseCenter(df, 3)
print(c)
#print(len(c))
dist = getDistances(c, df)
#print(dist)
y = findClosest(df, dist)
# q = []
# for i in range(len(c)):
# q.append([])
# #print(q)
j = computeNewCenter(df, y)
#print(j)
length = df.shape
oldFrame = pd.DataFrame(np.ndarray((length[0],length[1])))
oldFrame = oldFrame.fillna(0)
ct=0
while y.equals(oldFrame) == False:
ct+=1
oldFrame = y.copy()
c = computeNewCenter(df, oldFrame)
#print(c)
dist = getDistances(c, df)
#print(dist)
y = findClosest(df, dist)
#print(y)
#plt.pcolor(df, cmap=plt.cm.bwr)
l = []
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 1:
l.append(df.iloc[i][j])
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 2:
l.append(df.iloc[i][j])
for i in range(len(y)):
for j in range(len(y[0])):
if y.iloc[i][j] == 0:
l.append(df.iloc[i][j])
l = np.ndarray((length[0],length[1]))
l = pd.DataFrame(l)
print(l)
hm = plt.pcolor(l, cmap=plt.cm.bwr)
plt.colorbar(hm)
# print(y)
# print(c)
# print(ct)
#plt.pcolor(y, cmap=plt.cm.bwr)
if __name__ == '__main__':
main()
My line of thinking was this (a compact sketch of these steps follows the list):
First, randomly choose the centers.
Then create a list of lists holding, for each point, the distance to each center.
Find, for each point, the index of the center with the minimum distance.
Create a data frame the same size as the data set, where each element holds the index of the center its point is closest to.
Recompute each center by taking the mean of the points assigned to it.
Repeat this process until the index data frame no longer changes.
Create a new data frame in which points assigned to the same center are placed next to each other.
Then create the heatmap.
This did not seem to work though.
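For reference, here is a compact NumPy sketch of the loop those steps describe, operating on the flattened scalar values. It is only an illustration of the idea; the function and variable names are mine, not from the code above, and empty clusters are not handled.
import numpy as np

def kmeans_1d(values, k, max_iter=100, seed=None):
    # pick k values from the grid as the initial centers
    rng = np.random.default_rng(seed)
    vals = np.asarray(values, dtype=float).ravel()
    centers = rng.choice(vals, size=k, replace=False)
    labels = None
    for _ in range(max_iter):
        # distance from every value to every center, then index of the closest
        dist = np.abs(vals[:, None] - centers[None, :])
        new_labels = dist.argmin(axis=1)
        # stop once the assignments no longer change
        if labels is not None and np.array_equal(new_labels, labels):
            break
        labels = new_labels
        # each new center is the mean of the values assigned to it
        centers = np.array([vals[labels == j].mean() for j in range(k)])
    # reshape the labels back to the shape of the original grid
    return labels.reshape(np.shape(values)), centers

The resulting label grid can then be reordered by cluster and passed to plt.pcolor for the heatmap.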
Just wondering: am I on the right track or am I completely off? If I am on the right track, which parts would I need to change in order to fix the issue? If not, could you please point me in the right direction?
Here is a comparison of the maps:
Here are the maps
The first one is the one my program generated while the second is the way it is supposed to look.
I know my problem lies in some part of the k-means clustering algorithm. My guess is that it is either in the reassignment stage, where the points are reassigned to the centroids and the new centroids are calculated, or in the stopping condition, in that the algorithm does not run long enough. In the back of my head, something also tells me that I am not doing this as efficiently as I could and that I am missing something key. I have watched several videos on k-means clustering and understand it conceptually; I'm just having a hard time implementing it.
I'm trying to write some Bayesian probit code using data augmentation. I can get it to work if I loop over the rows of the output matrix, but I'd like to vectorize it and do it all in one shot (presumably that would be faster).
import numpy as np
from numpy import random
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, truncnorm
##################################
### Create some simulated data ###
num_leg = 50
num_bills = 20
a = np.random.uniform(-1,1,num_bills).reshape(num_bills, 1)
b = np.random.uniform(-2,2,num_bills).reshape(num_bills, 1)
x = np.random.standard_normal(num_leg).reshape(num_leg, 1)
ystar_base = a + np.dot(b,x.T)
epsilon = np.random.standard_normal(num_leg * num_bills).reshape(num_bills, num_leg)
ystar = ystar_base + epsilon
y = 1*(ystar >0)
### Initialize some stuff I need ###
avec = [0]*num_bills # These are bill parameters
bvec = [0]*num_bills
betavec = [np.matrix(zip(avec,bvec))]
xvec = [0]*num_leg # these are legislator parameters
x_ones = np.ones(num_leg)  # column of ones for the intercept term
def init_y(mat): # initialize a latent y matrix
if mat==1: return truncnorm.rvs(0,10000)
else: return truncnorm.rvs(-10000,0)
vectorize_y = np.vectorize(init_y)
latent_y = np.matrix(vectorize_y(y))
burn = 500 # How long to run the MCMC
runs = 500
### define the functions ###
def sample_params(xnow,ynow): # This is the function I'd like to vectorize
if type(xnow) == list:
xnow = np.array(xnow)
if type(ynow) == list:
ynow = np.array(ynow)
ynow = ynow.T #reshape(ynow.shape[0],1)
sigma = np.linalg.inv(np.dot(xnow.T,xnow)) ###This is the line that produces an error###
xy = np.dot(xnow.T,ynow)
mu = np.dot(sigma, xy) # this is just (x'x)inv x'y
return np.random.multivariate_normal(np.array(mu).flatten(), sigma)
vecparams = np.vectorize(sample_params)
def get_mu(xnow, bnow): # getting the updated mean to draw the latent ys
if type(xnow) == list:
xnow = np.array(xnow)
if type(bnow) == list:
bnow = np.array(bnow)
mu = np.dot(xnow,bnow.T)
mu = np.matrix(mu)
return mu
def sample_y(mu, ynow): # generate latent y matrix
if ynow==1:
a, b = (0 - mu),(10000-mu)
else:
a, b = (-10000 - mu),(0-mu)
return truncnorm.rvs(a,b)
vector_sample = np.vectorize(sample_y) # I'd like to be able to do something like this
### Here's the MCMC loop with the internal loop over rows(bills)
for i in range(burn+runs):
this_beta = []
this_x = []
this_y = []
for j in range(num_bills): #I'd like to get rid of this loop
ex = zip(x_ones, x)
newbeta = sample_params(ex, latent_y[j])
this_beta.append(newbeta)
#ex = np.array(zip(x_ones, x))
#this_beta = vecparams(ex, latent_y[:,]) # and call the vectorized function here
betavec.append(this_beta)
#Note, I can vectorize the latent outputs easily enough here
mean = get_mu(ex, betavec[-1])
latent_y = np.matrix(vector_sample(mean, np.matrix(y).T).T.reshape(latent_y.shape[0], latent_y.shape[1]))
### Now a bit of code to check to see if I've recovered what I want ###
test_beta = [zip(*(z)) for z in betavec[burn:]]
test_a = np.array([z[0] for z in test_beta])
test_b = np.array([z[1] for z in test_beta])
amean = test_a.sum(axis = 0)/float(runs)
bmean = test_b.sum(axis = 0)/float(runs)
print 'a mean'
print np.corrcoef([amean, np.array(a)])
print
print 'b mean'
print np.corrcoef([bmean, np.array(b)])
If I comment out the loop and use the commented out lines just above, I get the following error at the line I indicated earlier (the one that defines sigma):
LinAlgError: 0-dimensional array given. Array must be at least two-dimensional