I'm trying to convert this PyMC2 example of an ERGM to PyMC3.
Using the documentation and other examples, I have come up with this code. It runs without throwing errors, but gives the wrong answers (the estimates are ~0). This first part is just setup: it is almost identical to the tutorial and works correctly.
import numpy as np
import pymc3 as pm
import networkx as nx
import csv
import theano.tensor as tt
from theano.compile.ops import as_op

with open("grey_adjacency.tsv") as f:
    first_line = f.readline()
adj = np.loadtxt("grey_adjacency.tsv", delimiter="\t", skiprows=1, usecols=list(range(1, 45)))
G = nx.from_numpy_matrix(adj)
names = [name.strip() for name in first_line.split("\t")[1:]]
G = nx.relabel_nodes(G, {i: names[i] for i in range(44)})

node_attributes = []
with open("grey_nodes.tsv") as f:
    reader = csv.DictReader(f, dialect=csv.excel_tab)
    for row in reader:
        node_attributes.append(row)

for node in node_attributes:
    name = node["name"]
    for key, val in node.items():
        if key == "name":
            continue
        G.node[name][key] = val

matrix = nx.to_numpy_matrix(G)
matrix[np.triu_indices_from(matrix)] = 0
def edge_count(G):
    size = len(G)
    ones = np.ones((size, size))
    # Zero out the upper triangle:
    if not G.is_directed():
        ones[np.triu_indices_from(ones)] = 0
    return ones

def node_match(G, attrib):
    size = len(G)
    attribs = [node[1][attrib] for node in G.nodes(data=True)]
    match = np.zeros(shape=(size, size))
    for i in range(size):
        for j in range(size):
            if i != j and attribs[i] == attribs[j]:
                match[i, j] = 1
    if not G.is_directed():
        match[np.triu_indices_from(match)] = 0
    return match

# Create the gender-match matrix
gender_match_mat = node_match(G, "sex")
This next part is where I am having trouble converting the old PyMC2 code to use PyMC3:
@as_op(itypes=[tt.dmatrix, tt.dmatrix], otypes=[tt.dmatrix])
def probs(t1, t2):
    probs = 1/(1+np.exp(-1*sum([t1, t2])))
    probs[np.diag_indices_from(probs)] = 0
    probs[np.triu_indices_from(probs)] = 0
    return probs
with pm.Model():
    density_coef = pm.Normal("density", mu=0, sd=0.001)
    gender_match_coef = pm.Normal("gender_match", mu=0, sd=0.001)

    density_term = density_coef * edge_count(G)
    gender_match_term = gender_match_coef * gender_match_mat
    ps = probs(density_term, gender_match_term)

    outcome = pm.Bernoulli("outcome", p=ps, observed=matrix)
    trace = pm.sample(5000, step=pm.Metropolis(), tune=500, njobs=1)

density_trace = trace["density"]
gender_match_trace = trace["gender_match"]
print("Density: {0:.3f}, {1:.3f}".format(np.mean(density_trace), np.std(density_trace)))
print("Gender: {0:.3f}, {1:.3f}".format(np.mean(gender_match_trace), np.std(gender_match_trace)))
Which gives the incorrect output:
Density: -0.000, 0.001
Gender: -0.000, 0.001
This answer suggests I could get it working by using tensors instead of the original example's numpy functions. I tried this:
@as_op(itypes=[tt.dmatrix, tt.dmatrix], otypes=[tt.dmatrix])
def probs(t1, t2):
    probs = 1/(1+tt.exp(-1*tt.sum([t1, t2])))
    probs = tt.fill_diagonal(probs, 0)
    probs = tt.tril(probs)
    return probs
But that gives the error:
TypeError: FillDiagonal: first parameter must have at least two dimensions
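If it helps, my current reading of that error (an assumption on my part, not something I've confirmed) is that tt.sum([t1, t2]) collapses everything into a single scalar, whereas the built-in sum([t1, t2]) in the first version adds the two matrices element-wise, so fill_diagonal ends up with a 0-dimensional input:

import theano.tensor as tt

t1 = tt.dmatrix("t1")
t2 = tt.dmatrix("t2")

print(sum([t1, t2]).ndim)     # 2 -- element-wise addition, still a matrix
print(tt.sum([t1, t2]).ndim)  # 0 -- sums every element down to one scalar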
I also wonder whether I'm even going about this the right way: the documentation's logistic-regression example uses GLM instead, but I don't see how to apply that approach to this problem.
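For concreteness, this is the kind of dyad-level formulation I had in mind (purely a sketch under my own assumptions: flatten the lower triangle into a vector of dyads, use pm.math.sigmoid, and use wider priors than the sd=0.001 above; I have not verified that it reproduces the ERGM estimates):

# Hypothetical dyad-level version: one Bernoulli observation per lower-triangle dyad
tril_idx = np.tril_indices_from(np.asarray(matrix), k=-1)
y_obs = np.asarray(matrix)[tril_idx]      # observed ties
x_density = edge_count(G)[tril_idx]       # all ones
x_gender = gender_match_mat[tril_idx]     # 1 if sexes match

with pm.Model():
    density_coef = pm.Normal("density", mu=0, sd=10)          # assumption: wider prior
    gender_match_coef = pm.Normal("gender_match", mu=0, sd=10)
    eta = density_coef * x_density + gender_match_coef * x_gender
    pm.Bernoulli("outcome", p=pm.math.sigmoid(eta), observed=y_obs)
    trace = pm.sample(5000, tune=500)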
I wrote some code a while ago that processes spectra: it reads data from text files and performs calculations on them. I started with code that just does everything line by line without any functions, and despite being long, it finishes running in 2.11 seconds (according to %%timeit). That original code is below, labeled as such.
However, I wanted to move the code into functions to make it easier to read and to reuse with different models in the future. Even though I'm performing exactly the same steps as before (just inside functions now), it is much slower: I now have to wait about 15-20 minutes to get the same outputs. That code is also below. Why is it so much slower, and is there any way I can make it significantly faster while still using functions?
Original Code:
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate

filename = 'bpass_spectra.txt'
extinctionfile = 'ExtinctionLawPoints.txt'  # from R_V = 4.0
pointslist = []
datalist = []
speclist = []

# Constants
Msun = 1.98892e30  # solar mass [kg]
h = 4.1357e-15     # Planck's constant [eV s]
c = float(3e8)     # speed of light [m/s]

# Read spectra file
f = open(filename, 'r')
rawspectra = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawspectra[0])
del rawspectra[0]
for i in range(len(rawspectra)):
    newlist = rawspectra[i].split(' ')
    datalist.append(newlist)

# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()
for i in range(len(rawpoints)):
    newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
    pointslist.append(newlst)
pointslist = pointslist[3:]
lambdalist = [float(item[0]) for item in pointslist]
k_abslist = [float(item[4]) for item in pointslist]
xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)

# Create new lists
Elist = [float(item[0]) for item in datalist]
speclambdalist = [h*c*1e9/E for E in Elist]
z1list = [float(item[1]) for item in datalist]
speclist.extend(z1list)
met = met[0]
klist = [None]*len(speclist)
Loutlist = [None]*len(speclist)
Tlist = [None]*len(speclist)

# Define parameters
b = 2.0
R = 1.0
z = 1.0
Mgas = 1.0    # mass of gas, input
Mhalo = 2e41  # mass of dark matter halo, known

if float(met) > 0.0052:
    DGRlist = [50.0*np.exp(-2.21)*float(met)]*len(speclist)
elif float(met) <= 0.0052:
    DGRlist = [((50.0*float(met))**3.15)*np.exp(-0.96)]*len(speclist)

for i in range(len(speclist)):
    if float(Elist[i]) <= 4.1357e-3:   # frequencies <= 10^12 Hz
        klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**b  # extinction law [cm^2/g]
    elif float(Elist[i]) > 4.1357e-3:  # frequencies > 10^12 Hz
        klist[i] = k_interp(Elist[i])  # interpolated function's value at Elist[i]

Mdustlist = [Mgas*DGR for DGR in DGRlist]  # dust mass
Rhalo = 0.784*(0.27**2.0)*(0.7**(-2.0/3.0))*float(10.0/(1.0+z))*((Mhalo/(1e8*Msun))**(1.0/3.0))
Rdust = 0.018*Rhalo  # [kpc]
for i in range(len(speclist)):
    Tlist[i] = 3*Mdustlist[i]*klist[i]/(4*np.pi*Rdust)
Linlist = [float(spectra)*R for spectra in speclist]

# Outgoing luminosity as function of wavelength
for i in range(len(Linlist)):
    Loutlist[i] = Linlist[i]*np.exp(-Tlist[i])

# Test the calculation
print "LIN ELEMENTS 0 AND 1000:", Linlist[0], Linlist[1000]
print "LOUT ELEMENTS 0 AND 1000:", Loutlist[0], Loutlist[1000]
New "function-ized" Code (much slower):
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate

# Required files and lists
filename = 'bpass_spectra.txt'              # number of columns = 4
extinctionfile = 'ExtinctionLawPoints.txt'  # R_V = 4.0
datalist = []

if filename == 'bpass_spectra.txt':
    filetype = 4
else:
    filetype = 1

if extinctionfile == 'ExtinctionLawPoints.txt':
    R_V = 4.0
else:
    R_V = 1.0  # to be determined

# Constants
M_sun = 1.98892e30  # solar mass [kg]
h = 4.1357e-15      # Planck's constant [eV s]
c = float(3e8)      # speed of light [m/s]

# Inputs
beta = 2.0
R = 1.0
z = 1.0
M_gas = 1.0
M_halo = 2e41

# Read spectra file
f = open(filename, 'r')
rawlines = f.readlines()
met = re.findall('Z\s=\s(\d*\.\d+)', rawlines[0])
del rawlines[0]
for i in range(len(rawlines)):
    newlist = rawlines[i].split(' ')
    datalist.append(newlist)

# Read extinction curve data file
rawpoints = open(extinctionfile, 'r').readlines()

def interpolate(R_V, rawpoints, Elist, j):
    pointslist = []
    if R_V == 4.0:
        for i in range(len(rawpoints)):
            newlst = re.split('(?!\S)\s(?=\S)|(?!\S)\s+(?=\S)', rawpoints[i])
            pointslist.append(newlst)
    pointslist = pointslist[3:]
    lambdalist = [float(item[0]) for item in pointslist]
    k_abslist = [float(item[4]) for item in pointslist]
    xvallist = [(c*h)/(lamb*1e-6) for lamb in lambdalist]
    k_interp = scipy.interpolate.interp1d(xvallist, k_abslist)
    return k_interp(Elist[j])

# Dust extinction function
def dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met):
    speclist = []
    if filetype == 4:
        metallicity = float(met[0])
        Elist = [float(item[0]) for item in datalist]
        speclambdalist = [h*c*1e9/E for E in Elist]
        met1list = [float(item[1]) for item in datalist]
        speclist.extend(met1list)
    klist, Tlist = [None]*len(speclist), [None]*len(speclist)
    if metallicity > 0.0052:
        DGRlist = [50.0*np.exp(-2.21)*metallicity]*len(speclist)  # dust to gas ratio
    elif metallicity <= 0.0052:
        DGRlist = [((50.0*metallicity)**3.15)*np.exp(-0.96)]*len(speclist)
    for i in range(len(speclist)):
        if Elist[i] <= 4.1357e-3:   # frequencies <= 10^12 Hz
            klist[i] = 0.1*(float(Elist[i])/(1000.0*h))**beta  # extinction law [cm^2/g]
        elif Elist[i] > 4.1357e-3:  # frequencies > 10^12 Hz
            klist[i] = interpolate(R_V, rawpoints, Elist, i)   # interpolated function's value at Elist[i]
    Mdustlist = [M_gas*DGR for DGR in DGRlist]  # dust mass
    R_halo = 0.784*(0.27**2.0)*(0.7**(-2.0/3.0))*float(10/(1+z))*((M_halo/(1e8*M_sun))**(1.0/3.0))
    R_dust = 0.018*R_halo  # [kpc]
    # Optical depth calculation
    Tlist = [3*Mdustlist[i]*klist[i]/(4*np.pi*R_dust) for i in range(len(speclist))]
    # Ingoing and outgoing luminosities as functions of wavelength
    Linlist = [float(spectra)*R for spectra in speclist]
    Loutlist = [Linlist[i]*np.exp(-Tlist[i]) for i in range(len(speclist))]
    return speclambdalist, Linlist, Loutlist

print dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)
Even when I only have the function return Loutlist instead of the tuple of three lists, it's still extremely slow. Any ideas why? Also, I'm going to want to return the tuple and then plot speclambdalist against Linlist, and speclambdalist against Loutlist, on the same plot. But I'm under the impression that each time I call dust(interpolate, filetype, datalist, beta, R, z, M_gas, M_halo, met)[i] with i = 0, 1, or 2 (which I'll be doing multiple times), the whole function has to run again. Is there any way to bypass these extra runs to increase the speed further? Thank you!
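For what it's worth, the repeated-run issue I describe above could presumably be avoided by calling dust once and unpacking the returned tuple (just a sketch of the calling pattern I have in mind, not a fix for the slowdown inside the functions):

# Call dust() once, keep all three outputs, and reuse them for both plots
speclambdalist, Linlist, Loutlist = dust(interpolate, filetype, datalist,
                                         beta, R, z, M_gas, M_halo, met)

plt.plot(speclambdalist, Linlist, label='L_in')
plt.plot(speclambdalist, Loutlist, label='L_out')
plt.legend()
plt.show()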
I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline

def truncate(f, n):
    return math.floor(f * 10 ** n) / 10 ** n

def chooseCenter(data, centers):
    length = data.shape
    cent = []
    while len(cent) < centers:
        x = random.randrange(0, length[0])
        y = random.randrange(0, length[1])
        if data.iloc[x][y] not in cent:
            d = truncate(data.iloc[x][y], 2)
            cent.append(d)
    return cent

def distance(val, center):
    return math.sqrt((val - center)**2)

def getDistances(centers, data):
    length = data.shape
    dist = []
    for i in range(length[0]):
        for j in range(length[1]):
            y = []
            for k in range(len(centers)):
                val = distance(data.iloc[i][j], centers[k])
                y.append(truncate(val, 3))
            dist.append(y)
    return dist
def findClosest(data, dist):
    close = data.copy()
    length = close.shape
    indexes = []
    for i in range(len(dist)):
        pt = min(dist[i])
        idx = dist[i].index(pt)
        indexes.append(idx)
    #print(indexes)
    length = data.shape
    n = np.array(indexes)
    n = pd.DataFrame(np.reshape(n, (length[0], length[1])))
    #reshape this data frame into the same shape as the data
    #keep running the find closest until there is no change
    #try heatmap on this?
    #this should cluster it, but to make sure test it
    #might need to do some tweaking to this
    return n
    # for i in range(length[0]):
    #     for j in range(length[1]):
    #         print('dist[i]', dist[j])
    #         pt = min(dist[j])
    #         print(pt)
    #         idx = dist[j].index(pt)
    #         close.iloc[i][j] = int(idx)
    # return close

def computeNewCenter(data, close):
    d = dict()
    for i in range(len(close)):
        for j in range(len(close[0])):
            d[close.iloc[i][j]] = []
    for i in range(len(data)):
        for j in range(len(data[0])):
            if close.iloc[i][j] in d:
                d[close.iloc[i][j]].append(data.iloc[i][j])
    newCenters = []
    for key, value in d.items():
        m = np.mean(value)
        newCenters.append(truncate(m, 3))
    return newCenters
    # lst = [[] * numcenters]
    # for i in range(len(close)):
    #     for j in range(len(close[0])):
    #         if close.iloc[i][j]
def main():
    data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
    data = data.T
    #print(data)
    df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
    df = df.iloc[::-1]
    # print(df)
    # print(df.iloc[1][9])
    # print(df)
    # print(df.iloc[0][1])
    # heatmap = plt.pcolor(df, cmap=plt.cm.bwr)
    # plt.colorbar(heatmap)
    c = chooseCenter(df, 3)
    print(c)
    #print(len(c))
    dist = getDistances(c, df)
    #print(dist)
    y = findClosest(df, dist)
    # q = []
    # for i in range(len(c)):
    #     q.append([])
    # #print(q)
    j = computeNewCenter(df, y)
    #print(j)
    length = df.shape
    oldFrame = pd.DataFrame(np.ndarray((length[0], length[1])))
    oldFrame = oldFrame.fillna(0)
    ct = 0
    while y.equals(oldFrame) == False:
        ct += 1
        oldFrame = y.copy()
        c = computeNewCenter(df, oldFrame)
        #print(c)
        dist = getDistances(c, df)
        #print(dist)
        y = findClosest(df, dist)
        #print(y)
    #plt.pcolor(df, cmap=plt.cm.bwr)
    l = []
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 1:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 2:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 0:
                l.append(df.iloc[i][j])
    l = np.ndarray((length[0], length[1]))
    l = pd.DataFrame(l)
    print(l)
    hm = plt.pcolor(l, cmap=plt.cm.bwr)
    plt.colorbar(hm)
    # print(y)
    # print(c)
    # print(ct)
    #plt.pcolor(y, cmap=plt.cm.bwr)

if __name__ == '__main__':
    main()
My line of thinking was this:
First, randomly choose the centers.
Then create a list of lists for each point for the distance to each center.
Find the index of the minimum distance for each point for each center.
Create a data frame of the same size as the data set and fill each index for each element with the index of the center the point is closest to.
Recompute the center by taking the mean of the points with the same center index
Repeat this process multiple times until the index data frame does not change.
Create a new data frame and add the points which have the same center point close together in the frame.
Then create the heatmap (a rough sketch of this whole loop is included just below).
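For comparison, here is the compact version of that loop I have in mind, done on the flattened cell values with plain numpy (my own simplification for illustration, not the grid-based code above):

# Toy 1-D k-means over the flattened cell values (hypothetical helper, k=3)
import numpy as np

def kmeans_1d(values, k=3, max_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    centers = rng.choice(values, size=k, replace=False)
    labels = np.full(len(values), -1)
    for _ in range(max_iter):
        # assignment step: index of the nearest center for every value
        new_labels = np.abs(values[:, None] - centers[None, :]).argmin(axis=1)
        if np.array_equal(new_labels, labels):  # stopping condition
            break
        labels = new_labels
        # update step: each center becomes the mean of its assigned values
        for c in range(k):
            if np.any(labels == c):
                centers[c] = values[labels == c].mean()
    return labels, centers

# labels.reshape(grid_shape) could then be fed to plt.pcolor as the heatmap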
This did not seem to work, though.
Am I on the right track, or am I completely off? If I am on the right track, which parts would I need to change to fix the issue? If not, could you please point me in the right direction?
Here is a comparison of the maps (linked image): the first is the one my program generated, while the second is the way it is supposed to look.
I know my problem lies in some part of the k-means clustering algorithm. My guess is that it is either in the reassignment stage, where the points are reassigned to the centroids and the new centroids are calculated, or in the stopping condition, in that the algorithm does not run long enough. In the back of my head, something also tells me that I am not doing this as efficiently as I could and that I am missing something key. I have watched several videos on k-means clustering and understand it conceptually; I'm just having a hard time implementing it.
I'm currently trying to find the intersection of two equations in my code (pasted below). I'm using fsolve and have used it successfully in one part, but I can't get it to work for the second.
Confusingly, it doesn't raise an error: if you paste this code into a notebook and run it, you'll see two graphs, and on the first graph there's a line at an angle which should be stopping at the eqm line.
The section which won't work is def q_Eqm(x_q). Thank you for your help.
import numpy as np
import scipy.optimize as opt
import matplotlib.pyplot as plt
AC_LK = np.array([4.02232,1206.53,220.291])
AC_HK = np.array([4.0854,1348.77,219.976])
P_Tot = 1 # Bara
N_Size = 11 # 1001 = 0.1% accuracy for xA
xf = 0.7
q = 0.7
xA = np.linspace(0,1,N_Size)
yA = np.linspace(0.00,0.00,N_Size)
T = np.linspace(0.00,0.00,N_Size)
x = np.array([xA[0:N_Size],yA[0:N_Size],T[0:N_Size]]) # x[xA,yA,T]
F = np.empty((1))
def xA_T(N):
    xA_Ant = x[0,N]
    def P_Ant(T):
        PA = pow(10,AC_LK[0]-(AC_LK[1]/(T+AC_LK[2])))*xA_Ant
        PB = pow(10,AC_HK[0]-(AC_HK[1]/(T+AC_HK[2])))*(1-xA_Ant)
        F[0] = P_Tot - (PA + PB)
        return F
    TGuess = [100]
    T = opt.fsolve(P_Ant,TGuess)
    x[2,N] = T
    return x

for N in range(0,len(xA)):
    xA_T(N)
    x[1,N] = pow(10,AC_LK[0]-(AC_LK[1]/(x[2,N]+AC_LK[2])))*x[0,N]/P_Tot
q_int = ((-q*0)/(1-q)) + (xf/(1-q))
Eqm_Poly = np.polyfit(x[0,0:N_Size], x[1,0:N_Size], 6)
q_Poly = np.polyfit([xf,0], [xf,q_int], 1)
F = np.empty((1))
def q_Eqm(x_q):
    y_q = q_Poly[0]*x_q + q_Poly[1]
    eqm_y = (Eqm_Poly[0]*pow(x_q,6)+Eqm_Poly[1]*pow(x_q,5)+Eqm_Poly[2]*pow(x_q,4)+Eqm_Poly[3]*pow(x_q,3)+Eqm_Poly[4]*pow(x_q,2)+Eqm_Poly[5]*pow(x_q,1)+Eqm_Poly[6]*pow(x_q,0))
    F[0] = y_q - eqm_y
    return F
x_qGuess = [0]
x_q = opt.fsolve(q_Eqm,x_qGuess)
print(x,Eqm_Poly,x_q,q_int)
plt.plot(x[0,0:N_Size],x[1,0:N_Size],'k-',linewidth=1)
plt.plot([xf,xf],[0,xf],'b-',linewidth=1)
plt.plot([xf,x_q],[xf,(q_Poly[0]*x_q + q_Poly[1])],'r-',linewidth=1)
plt.legend(['Eqm','Feed'])
plt.xlabel('xA')
plt.ylabel('yA')
plt.xlim([0.00, 1])
plt.ylim([0.00, 1])
plt.savefig('x.png')
plt.savefig('x.eps')
plt.show()
plt.plot(x[0,0:N_Size],x[2,0:N_Size],'r--',linewidth=3)
plt.plot(x[1,0:N_Size],x[2,0:N_Size],'b--',linewidth=3)
plt.legend(['xA','yA'])
plt.xlabel('Mol Frac')
plt.ylabel('Temp degC')
plt.xlim([0, 1])
plt.savefig('Txy.png')
plt.savefig('Txy.eps')
plt.show()
The answer turns out to be relatively simple:
#F = np.empty((1)) # remove this
def q_Eqm(x_q):
    y_q = q_Poly[0]*x_q + q_Poly[1]
    eqm_y = (Eqm_Poly[0]*pow(x_q,6)+Eqm_Poly[1]*pow(x_q,5)+Eqm_Poly[2]*pow(x_q,4)+Eqm_Poly[3]*pow(x_q,3)+Eqm_Poly[4]*pow(x_q,2)+Eqm_Poly[5]*pow(x_q,1)+Eqm_Poly[6]*pow(x_q,0))
    return y_q - eqm_y
The original code defines a global F, which is modified in the function and then returned. So in each iteration the function returns different values but they are the same object. This seems to confuse fsolve (I guess it internally stores references to the results rather than values). Removing this F and simply returning the result of the subtraction resolves the problem.
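A small illustration of the aliasing being described (just the Python-level behaviour, not fsolve's internals):

import numpy as np

F = np.empty((1))

def f_shared(x):
    F[0] = x - 3.0
    return F            # always the same array object

a = f_shared(1.0)
b = f_shared(5.0)
print(a is b, a, b)     # True [2.] [2.] -- the first "result" was silently overwritten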
I'm currently trying to draw some edges in NetworkX. My nodes have two patch properties, position and status, which are used in a colonisation simulation algorithm. I've been trying to scale up my simulation, which has meant turning away from working out Euclidean distances between my nodes (and also away from code that works!).
I have a CSV of the row-number indices of the nearest neighbours of each node; each index corresponds to a row of another CSV which holds the 3D coordinates of the nodes. For example, row 0 of the nearest-neighbour CSV might contain 0, 56, 76 in separate columns, meaning node 0 has nearest neighbours at nodes 56 and 76, whose coordinates are on rows 0, 56 and 76 of the coordinate CSV.
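To make that layout concrete, a toy version of the mapping might look like this (made-up numbers, purely for illustration):

# Toy illustration of the index mapping between the two files
coords = {0: (12.3, -4.1, 7.8), 56: (9.0, 3.2, -1.5), 76: (-2.7, 8.8, 0.4)}  # coordinate csv rows
neighbours = {0: [0, 56, 76]}  # row 0 of the nearest-neighbour csv

# edges implied by row 0: (0, 56) and (0, 76)
edges = [(0, j) for j in neighbours[0] if j != 0]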
I then need to draw edges between these nearest-neighbour nodes so my algorithm can work with them. So I have some pseudo-code:
import networkx as nx
import numpy as np
from sklearn.neighbors import BallTree
import csv
from itertools import izip_longest
import pandas as pd
density = 0.14 #Stellar density per cubic parsec
L = 100
Patches = int(0.056*density*L**3+15)
P_init = 0.0001 # Probability that a patch will be occupied at the beginning
Distance = 10
dat = np.random.uniform(low = -1, high = 1, size = (Patches,3)) * L
np.savetxt('nearand1.csv', dat, delimiter = ',')
nearand = np.genfromtxt('nearand1.csv', delimiter = ',',usecols=np.arange(0, 3))
tree = BallTree(nearand, leaf_size=2)
ind = tree.query_radius(nearand, r=10)
df = pd.DataFrame(ind)
df.to_csv('bobbington4.csv',sep='e',index=False, header=False)
xcoord = nearand[:,0]
ycoord = nearand[:,1]
zcoord = nearand[:,2]
bobbington = np.genfromtxt('bobbington4.csv', delimiter = ',', dtype = 'int')
bobbington0 = bobbington[:,0]
bobbington1 = bobbington[:,1]
bobbington2 = bobbington[:,2]
bobbington3 = bobbington[:,3]
bobbington4 = bobbington[:,4]
bobbington5 = bobbington[:,5]
bobbington6 = bobbington[:,6]
bobbington7 = bobbington[:,7]
bobbington8 = bobbington[:,8]
bobbington9 = bobbington[:,9]
bobbington10 = bobbington[:,10]
bobbington11 = bobbington[:,11]
bobbington12 = bobbington[:,12]
bobbington13 = bobbington[:,13]
class patch:
    def __init__(self, status=0, pos=(0, 0, 0)):
        self.status = status
        self.pos = pos
    def __str__(self):
        return(str(self.status))

G = nx.Graph()
for i in xrange(Patches):
    Stat = 1 if np.random.uniform() < P_init else 0
    Pos = (xcoord[i], ycoord[i], zcoord[i])
    G.add_node(patch(Stat, Pos))

for i in G.nodes():
    for j in G.nodes():
        if i.pos where i == bobbington0:
            if j.pos where j == bobbington1:
                G.add_edge(i, j)

pos = {}
for n in G.nodes():
    pos[n] = n.pos

occup = [n.status for n in G]
Time = [0]
Occupancy = [np.sum([n.status for n in G])/float(Patches)]
Here bobbington0 is just a column of node indices going from 0 to 7854, and bobbington1 is the first nearest neighbour for each of those nodes. What is the best way to go about this? I'm struggling to find anything on this type of problem, but I'm probably wording things poorly.
Thanks in advance for any help you can give me.
I've got it. Not particularly elegant but it works.
for i in G.nodes():
    for j in G.nodes():
        diff1 = j.boba[0] - i.bubu
        if diff1 == 0:
            G.add_edge(i, j)
I'm trying to write some Bayesian probit code using data augmentation. I can get it to work if I loop over the rows of the output matrix, but I'd like to vectorize it and do it all in one shot (presumably that's faster).
import numpy as np
from numpy import random
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, truncnorm
##################################
### Create some simulated data ###
num_leg = 50
num_bills = 20
a = np.random.uniform(-1,1,num_bills).reshape(num_bills, 1)
b = np.random.uniform(-2,2,num_bills).reshape(num_bills, 1)
x = np.random.standard_normal(num_leg).reshape(num_leg, 1)
ystar_base = a + np.dot(b,x.T)
epsilon = np.random.standard_normal(num_leg * num_bills).reshape(num_bills, num_leg)
ystar = ystar_base + epsilon
y = 1*(ystar >0)
### Initialize some stuff I need ###
avec = [0]*num_bills # These are bill parameters
bvec = [0]*num_bills
betavec = [np.matrix(zip(avec,bvec))]
xvec = [0]*num_leg # these are legislator parameters
x_ones = np.ones(num_leg)
def init_y(mat):  # initialize a latent y matrix
    if mat == 1:
        return truncnorm.rvs(0, 10000)
    else:
        return truncnorm.rvs(-10000, 0)
vectorize_y = np.vectorize(init_y)
latent_y = np.matrix(vectorize_y(y))
burn = 500 # How long to run the MCMC
runs = 500
### define the functions ###
def sample_params(xnow, ynow):  # This is the function I'd like to vectorize
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(ynow) == list:
        ynow = np.array(ynow)
    ynow = ynow.T  # reshape(ynow.shape[0],1)
    sigma = np.linalg.inv(np.dot(xnow.T, xnow))  ### This is the line that produces an error ###
    xy = np.dot(xnow.T, ynow)
    mu = np.dot(sigma, xy)  # this is just (x'x)inv x'y
    return np.random.multivariate_normal(np.array(mu).flatten(), sigma)
vecparams = np.vectorize(sample_params)
def get_mu(xnow, bnow):  # getting the updated mean to draw the latent ys
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(bnow) == list:
        bnow = np.array(bnow)
    mu = np.dot(xnow, bnow.T)
    mu = np.matrix(mu)
    return mu

def sample_y(mu, ynow):  # generate latent y matrix
    if ynow == 1:
        a, b = (0 - mu), (10000 - mu)
    else:
        a, b = (-10000 - mu), (0 - mu)
    return truncnorm.rvs(a, b)
vector_sample = np.vectorize(sample_y) # I'd like to be able to do something like this
### Here's the MCMC loop with the internal loop over rows(bills)
for i in range(burn+runs):
    this_beta = []
    this_x = []
    this_y = []
    for j in range(num_bills):  # I'd like to get rid of this loop
        ex = zip(x_ones, x)
        newbeta = sample_params(ex, latent_y[j])
        this_beta.append(newbeta)
    #ex = np.array(zip(x_ones, x))
    #this_beta = vecparams(ex, latent_y[:,])  # and call the vectorized function here
    betavec.append(this_beta)
    # Note, I can vectorize the latent outputs easily enough here
    mean = get_mu(ex, betavec[-1])
    latent_y = np.matrix(vector_sample(mean, np.matrix(y).T).T.reshape(latent_y.shape[0], latent_y.shape[1]))
### Now a bit of code to check to see if I've recovered what I want ###
test_beta = [zip(*(z)) for z in betavec[burn:]]
test_a = np.array([z[0] for z in test_beta])
test_b = np.array([z[1] for z in test_beta])
amean = test_a.sum(axis = 0)/float(runs)
bmean = test_b.sum(axis = 0)/float(runs)
print 'a mean'
print np.corrcoef([amean, np.array(a)])
print
print 'b mean'
print np.corrcoef([bmean, np.array(b)])
If I comment out the loop and use the commented out lines just above, I get the following error at the line I indicated earlier (the one that defines sigma):
LinAlgError: 0-dimensional array given. Array must be at least two-dimensional
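My guess about where that comes from (an assumption on my part, I haven't traced it fully): np.vectorize calls the wrapped function once per scalar element of the broadcast inputs, so inside sample_params xnow arrives as a 0-d value rather than the whole design matrix, and np.linalg.inv then fails. A tiny illustration:

import numpy as np

def show_ndim(xnow, ynow):
    # np.vectorize hands the function scalar elements, not whole arrays
    print(np.asarray(xnow).ndim)  # prints 0 for every call
    return 0.0

v = np.vectorize(show_ndim)
v(np.ones((3, 2)), np.ones((3, 2)))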